Esempio n. 1
0
    def __init__(self, env, name="session"):
        """
        Build a Holoclean session together with the engines it drives.

        :param env: Holoclean environment; its 'verbose' and 'seed' entries
            are read here and the environment is passed to every component.
        :param name: Name for the Holoclean session (also passed to the
            Dataset).
        """
        # Verbose sessions switch the root and gensim loggers to DEBUG.
        verbose = env['verbose']
        if verbose:
            root_logger.setLevel(logging.DEBUG)
            gensim_logger.setLevel(logging.DEBUG)

        logging.debug('initiating session with parameters: %s', env)

        # Seed the stdlib, torch and numpy generators with the same value.
        seed = env['seed']
        random.seed(seed)
        torch.manual_seed(seed)
        np.random.seed(seed=seed)

        # Session state: every engine shares the one Dataset instance.
        self.name, self.env = name, env
        dataset = Dataset(name, env)
        self.ds = dataset
        self.dc_parser = Parser(env, dataset)
        self.domain_engine = DomainEngine(env, dataset)
        self.detect_engine = DetectEngine(env, dataset)
        self.repair_engine = RepairEngine(env, dataset)
        self.eval_engine = EvalEngine(env, dataset)
Esempio n. 2
0
    def __init__(self, env, name="session"):
        """
        Build a Holoclean session and configure logging verbosity.

        :param env: Holoclean environment; its 'verbose' entry selects the
            logging levels and the environment is passed to every component.
        :param name: Name for the Holoclean session (also passed to the
            Dataset).
        """

        # Session state: every engine shares the one Dataset instance.
        self.name, self.env = name, env
        dataset = Dataset(name, env)
        self.ds = dataset
        self.dc_parser = Parser(env, dataset)
        self.domain_engine = DomainEngine(env, dataset)
        self.detect_engine = DetectEngine(env, dataset)
        self.repair_engine = RepairEngine(env, dataset)
        self.eval_engine = EvalEngine(env, dataset)

        # Verbose sessions log everything at DEBUG; otherwise the root logger
        # stays at INFO and gensim is limited to WARNING.
        root = logging.getLogger()
        gensim = logging.getLogger('gensim')
        if self.env['verbose']:
            levels = (logging.DEBUG, logging.DEBUG)
        else:
            levels = (logging.INFO, logging.WARNING)
        root.setLevel(levels[0])
        gensim.setLevel(levels[1])
Esempio n. 3
0
    def __init__(self, env, name="session"):
        """
        Build a Holoclean session together with the engines it drives.

        :param env: Holoclean environment, passed to every component.
        :param name: Name for the Holoclean session (also passed to the
            Dataset).
        """

        # Session state: every engine shares the one Dataset instance.
        self.name, self.env = name, env
        dataset = Dataset(name, env)
        self.ds = dataset
        self.dc_parser = Parser(env, dataset)
        self.domain_engine = DomainEngine(env, dataset)
        self.detect_engine = DetectEngine(env, dataset)
        self.repair_engine = RepairEngine(env, dataset)
        self.eval_engine = EvalEngine(env, dataset)
Esempio n. 4
0
class Session:
    """
    Session class controls the entire pipeline of HC
    """
    def __init__(self, env, name="session"):
        """
        Constructor for Holoclean session

        :param env: Holoclean environment; the 'verbose' and 'seed' entries
            are read here and the environment is passed to every engine.
        :param name: Name for the Holoclean session
        """
        # use DEBUG logging level if verbose enabled
        if env['verbose']:
            root_logger.setLevel(logging.DEBUG)
            gensim_logger.setLevel(logging.DEBUG)

        logging.debug('initiating session with parameters: %s', env)

        # Initialize random seeds.
        random.seed(env['seed'])
        torch.manual_seed(env['seed'])
        np.random.seed(seed=env['seed'])

        # Initialize members
        self.name = name
        self.env = env
        self.ds = Dataset(name, env)
        self.dc_parser = Parser(env, self.ds)
        self.domain_engine = DomainEngine(env, self.ds)
        self.detect_engine = DetectEngine(env, self.ds)
        self.repair_engine = RepairEngine(env, self.ds)
        self.eval_engine = EvalEngine(env, self.ds)

    def load_data(self,
                  name,
                  fpath,
                  na_values=None,
                  entity_col=None,
                  src_col=None):
        """
        load_data takes the filepath to a CSV file to load as the initial dataset.
        :param name: (str) name to initialize dataset with.
        :param fpath: (str) filepath to CSV file.
        :param na_values: (str) value that identifies a NULL value
        :param entity_col: (str) column containing the unique
            identifier/ID of an entity.  For fusion tasks, rows with
            the same ID will be fused together in the output.
            If None, assumes every row is a unique entity.
        :param src_col: (str) if not None, for fusion tasks
            specifies the column containing the source for each "mention" of an
            entity.
        """
        status, load_time = self.ds.load_data(name,
                                              fpath,
                                              na_values=na_values,
                                              entity_col=entity_col,
                                              src_col=src_col)
        logging.info(status)
        logging.debug('Time to load dataset: %.2f secs', load_time)

    def load_dcs(self, fpath):
        """
        load_dcs ingests the Denial Constraints for initialized dataset.
        :param fpath: filepath to TXT file where each line contains one denial constraint.
        """
        status, load_time = self.dc_parser.load_denial_constraints(fpath)
        logging.info(status)
        # The label previously said "dirty data", which is not what is loaded.
        logging.debug('Time to load denial constraints: %.2f secs', load_time)

    def get_dcs(self):
        """
        Return whatever the denial-constraint parser reports via get_dcs().
        """
        return self.dc_parser.get_dcs()

    def detect_errors(self, detect_list):
        """
        Run error detection through the detect engine and log its status.
        :param detect_list: passed unchanged to DetectEngine.detect_errors.
        """
        status, detect_time = self.detect_engine.detect_errors(detect_list)
        logging.info(status)
        logging.debug('Time to detect errors: %.2f secs', detect_time)

    def setup_domain(self):
        """
        Run the domain engine's setup step and log its status and duration.
        """
        status, domain_time = self.domain_engine.setup()
        logging.info(status)
        logging.debug('Time to setup the domain: %.2f secs', domain_time)

    def repair_errors(self, featurizers):
        """
        Run the repair pipeline: featurize, set up, fit, infer, then collect
        the inferred values and the repaired dataset.

        :param featurizers: passed unchanged to
            RepairEngine.setup_featurized_ds.
        :return: the featurizer-weights status when env['print_fw'] is truthy,
            otherwise None.
        """
        status, feat_time = self.repair_engine.setup_featurized_ds(featurizers)
        logging.info(status)
        logging.debug('Time to featurize data: %.2f secs', feat_time)
        status, setup_time = self.repair_engine.setup_repair_model()
        logging.info(status)
        # Report the setup duration itself (the featurization time used to be
        # logged here by mistake).
        logging.debug('Time to setup repair model: %.2f secs', setup_time)
        status, fit_time = self.repair_engine.fit_repair_model()
        logging.info(status)
        logging.debug('Time to fit repair model: %.2f secs', fit_time)
        status, infer_time = self.repair_engine.infer_repairs()
        logging.info(status)
        logging.debug('Time to infer correct cell values: %.2f secs',
                      infer_time)
        status, collect_time = self.ds.get_inferred_values()
        logging.info(status)
        logging.debug('Time to collect inferred values: %.2f secs',
                      collect_time)
        status, store_time = self.ds.get_repaired_dataset()
        logging.info(status)
        logging.debug('Time to store repaired dataset: %.2f secs', store_time)
        if self.env['print_fw']:
            status, weights_time = self.repair_engine.get_featurizer_weights()
            logging.info(status)
            logging.debug('Time to store featurizer weights: %.2f secs',
                          weights_time)
            return status

    def evaluate(self, fpath, tid_col, attr_col, val_col, na_values=None):
        """
        evaluate generates an evaluation report with metrics (e.g. precision,
        recall) given a test set.
        :param fpath: (str) filepath to test set (ground truth) CSV file.
        :param tid_col: (str) column in CSV that corresponds to the TID.
        :param attr_col: (str) column in CSV that corresponds to the attribute.
        :param val_col: (str) column in CSV that corresponds to correct value
            for the current TID and attribute (i.e. cell).
        :param na_values: (Any) how na_values are represented in the data.
        Returns an EvalReport named tuple containing the experiment results.
        """
        # The ground-truth table is named after the raw dataset.
        name = self.ds.raw_data.name + '_clean'
        status, load_time = self.eval_engine.load_data(name,
                                                       fpath,
                                                       tid_col,
                                                       attr_col,
                                                       val_col,
                                                       na_values=na_values)
        logging.info(status)
        logging.debug('Time to evaluate repairs: %.2f secs', load_time)
        status, report_time, eval_report = self.eval_engine.eval_report()
        logging.info(status)
        logging.debug('Time to generate report: %.2f secs', report_time)
        return eval_report

    def explain_repairs(self, detectors):
        """
        Build the constraints graphs, generate explanations for the repairs
        and plot each of them.

        NOTE(review): __init__ does not create `explain_engine`; it has to be
        attached to the session before this method is called.

        :param detectors: (list) of ErrorDetector objects
        :return: the explanation returned by the explain engine.
        :raises AttributeError: if the session has no explain engine.
        """
        explain_engine = getattr(self, 'explain_engine', None)
        if explain_engine is None:
            # Same exception type as the bare attribute access, but with a
            # message that names the actual problem.
            raise AttributeError(
                "Session has no 'explain_engine'; one must be attached "
                "before calling explain_repairs")
        graph_time = explain_engine.build_constraints_graphs()
        logging.debug('Time to build constraints graph: %.2f secs', graph_time)
        explanation, explain_time = explain_engine.explain_repairs(detectors)
        logging.info(explanation)
        logging.debug('Time to generate explanation: %.2f secs', explain_time)
        for e in explanation:
            explain_engine.plot_explanation(e)
        return explanation
Esempio n. 5
0
class Session:
    """
    Session class controls the entire pipeline of HC
    """
    def __init__(self, env, name="session"):
        """
        Constructor for Holoclean session

        :param env: Holoclean environment; the 'verbose' and 'seed' entries
            are read here and the environment is passed to every engine.
        :param name: Name for the Holoclean session
        """
        # use DEBUG logging level if verbose enabled
        if env['verbose']:
            root_logger.setLevel(logging.DEBUG)
            gensim_logger.setLevel(logging.DEBUG)

        logging.debug('initiating session with parameters: %s', env)

        # Initialize random seeds.
        random.seed(env['seed'])
        torch.manual_seed(env['seed'])
        np.random.seed(seed=env['seed'])

        # Initialize members
        self.name = name
        self.env = env
        self.ds = Dataset(name, env)
        self.dc_parser = Parser(env, self.ds)
        self.domain_engine = DomainEngine(env, self.ds)
        self.detect_engine = DetectEngine(env, self.ds)
        self.repair_engine = RepairEngine(env, self.ds)
        self.eval_engine = EvalEngine(env, self.ds)

    def load_data(self,
                  name,
                  fpath,
                  na_values=None,
                  entity_col=None,
                  src_col=None,
                  exclude_attr_cols=None,
                  numerical_attrs=None):
        """
        load_data takes the filepath to a CSV file to load as the initial dataset.

        :param name: (str) name to initialize dataset with.
        :param fpath: (str) filepath to CSV file.
        :param na_values: (str) value that identifies a NULL value
        :param entity_col: (str) column containing the unique
            identifier/ID of an entity.  For fusion tasks, rows with
            the same ID will be fused together in the output.
            If None, assumes every row is a unique entity.
        :param src_col: (str) if not None, for fusion tasks
            specifies the column containing the source for each "mention" of an
            entity.
        :param exclude_attr_cols: (str list) forwarded to Dataset.load_data.
        :param numerical_attrs: (str list) forwarded to Dataset.load_data.
        """
        status, load_time = self.ds.load_data(
            name,
            fpath,
            na_values=na_values,
            entity_col=entity_col,
            src_col=src_col,
            exclude_attr_cols=exclude_attr_cols,
            numerical_attrs=numerical_attrs)
        logging.info(status)
        logging.debug('Time to load dataset: %.2f secs', load_time)

    def load_dcs(self, fpath):
        """
        load_dcs ingests the Denial Constraints for initialized dataset.

        :param fpath: filepath to TXT file where each line contains one denial constraint.
        """
        status, load_time = self.dc_parser.load_denial_constraints(fpath)
        logging.info(status)
        # The label previously said "dirty data", which is not what is loaded.
        logging.debug('Time to load denial constraints: %.2f secs', load_time)

    def get_dcs(self):
        """
        Return whatever the denial-constraint parser reports via get_dcs().
        """
        return self.dc_parser.get_dcs()

    def detect_errors(self, detect_list):
        """
        Run error detection through the detect engine and log its status.

        :param detect_list: passed unchanged to DetectEngine.detect_errors.
        """
        status, detect_time = self.detect_engine.detect_errors(detect_list)
        logging.info(status)
        logging.debug('Time to detect errors: %.2f secs', detect_time)

    def disable_quantize(self):
        """
        Clear the do_quantization flag on the session, the dataset and the
        domain engine.
        """
        self.do_quantization = False
        self.ds.do_quantization = False
        self.domain_engine.do_quantization = False

    def quantize_numericals(self, num_attr_groups_bins):
        """
        Quantize the raw data with quantize_km and load the result as the
        session's quantized dataset.

        :param num_attr_groups_bins: list[tuple] where each tuple consists of
        (# of bins, list[str]) where the list[str] is a group of attributes to
        be treated as numerical.
        :return: the quantized data produced by quantize_km.
        """
        self.do_quantization = True
        self.ds.do_quantization = True
        self.domain_engine.do_quantization = True

        status, quantize_time, quantized_data = \
            quantize_km(self.env, self.ds.get_raw_data(), num_attr_groups_bins)

        logging.info(status)
        logging.debug('Time to quantize the dataset: %.2f secs',
                      quantize_time)

        self.load_quantized_data(quantized_data)

        return quantized_data

    def load_quantized_data(self, df):
        """
        Register `df` as the dataset's quantized table, store it in the
        database and index every attribute.

        :param df: dataframe holding the quantized data.
        """
        tic = time.time()
        name = self.ds.raw_data.name + '_quantized'
        self.ds.quantized_data = Table(name, Source.DF, df=df)

        # Re-store to DB, ensuring numerical values are stored as floats.
        df_correct_type = df.copy()
        for attr in self.ds.numerical_attrs:
            # NULL placeholders cannot be cast to float, so map them to NaN
            # before converting the column.
            df_correct_type.loc[df_correct_type[attr] == NULL_REPR,
                                attr] = np.nan
            df_correct_type[attr] = df_correct_type[attr].astype(float)
        df_correct_type.to_sql(name,
                               self.ds.engine.engine,
                               if_exists='replace',
                               index=False,
                               index_label=None)

        for attr in self.ds.quantized_data.get_attributes():
            self.ds.quantized_data.create_db_index(self.ds.engine, [attr])
        logging.debug('Time to load quantized dataset: %.2f secs',
                      time.time() - tic)

    def generate_domain(self):
        """
        Run the domain engine's setup step and log its status and duration.
        """
        status, domain_time = self.domain_engine.setup()
        logging.info(status)
        logging.debug('Time to generate the domain: %.2f secs', domain_time)

    def run_estimator(self):
        """
        Uses estimator to weak label and prune domain.
        """
        self.domain_engine.run_estimator()

    def repair_errors(self, featurizers):
        """
        Run the repair pipeline without a validation set.

        :param featurizers: passed unchanged to
            RepairEngine.setup_featurized_ds.
        :return: see _repair_errors.
        """
        return self._repair_errors(featurizers)

    def repair_validate_errors(self,
                               featurizers,
                               fpath,
                               tid_col,
                               attr_col,
                               val_col,
                               validate_period,
                               na_values=None):
        """
        Run the repair pipeline, validating against the given ground truth
        while fitting. Parameters are documented on _repair_errors.

        :return: see _repair_errors.
        """
        return self._repair_errors(featurizers,
                                   fpath=fpath,
                                   tid_col=tid_col,
                                   attr_col=attr_col,
                                   val_col=val_col,
                                   na_values=na_values,
                                   validate_period=validate_period)

    def _repair_errors(self,
                       featurizers,
                       fpath=None,
                       tid_col=None,
                       attr_col=None,
                       val_col=None,
                       na_values=None,
                       validate_period=None):
        """
        Repair errors and optionally runs validation set per epoch.

        Must specify the following parameters if validation required:

        :param fpath: (str) filepath to test set (ground truth) CSV file.
        :param tid_col: (str) column in CSV that corresponds to the TID.
        :param attr_col: (str) column in CSV that corresponds to the attribute.
        :param val_col: (str) column in CSV that corresponds to correct value
            for the current TID and attribute (i.e. cell).
        :param na_values: (Any) how na_values are represented in the data.
        :param validate_period: (int) perform validation every nth epoch.

        :return: the featurizer-weights status when env['print_fw'] is truthy,
            otherwise None.
        """
        status, feat_time = self.repair_engine.setup_featurized_ds(featurizers)
        logging.info(status)
        logging.debug('Time to featurize data: %.2f secs', feat_time)
        status, setup_time = self.repair_engine.setup_repair_model()
        logging.info(status)
        # Report the setup duration itself (the featurization time used to be
        # logged here by mistake).
        logging.debug('Time to setup repair model: %.2f secs', setup_time)

        # Plain fit when no validation file is given; otherwise load the
        # ground truth first and fit with periodic validation.
        if fpath is None:
            status, fit_time = self.repair_engine.fit_repair_model()
        else:
            # Set up validation set
            name = self.ds.raw_data.name + '_clean'
            status, load_time = self.eval_engine.load_data(name,
                                                           fpath,
                                                           tid_col,
                                                           attr_col,
                                                           val_col,
                                                           na_values=na_values)
            logging.info(status)
            logging.debug('Time to evaluate repairs: %.2f secs', load_time)

            status, fit_time = self.repair_engine.fit_validate_repair_model(
                self.eval_engine, validate_period)

        logging.info(status)
        logging.debug('Time to fit repair model: %.2f secs', fit_time)
        status, infer_time = self.repair_engine.infer_repairs()
        logging.info(status)
        logging.debug('Time to infer correct cell values: %.2f secs',
                      infer_time)
        # Locals are deliberately not called `time`, to avoid shadowing the
        # `time` module used elsewhere in this class.
        status, collect_time = self.ds.get_inferred_values()
        logging.info(status)
        logging.debug('Time to collect inferred values: %.2f secs',
                      collect_time)
        status, store_time = self.ds.get_repaired_dataset()
        logging.info(status)
        logging.debug('Time to store repaired dataset: %.2f secs', store_time)
        if self.env['print_fw']:
            status, weights_time = self.repair_engine.get_featurizer_weights()
            logging.info(status)
            logging.debug('Time to store featurizer weights: %.2f secs',
                          weights_time)
            return status

    def evaluate(self, fpath, tid_col, attr_col, val_col, na_values=None):
        """
        evaluate generates an evaluation report with metrics (e.g. precision,
        recall) given a test set.

        :param fpath: (str) filepath to test set (ground truth) CSV file.
        :param tid_col: (str) column in CSV that corresponds to the TID.
        :param attr_col: (str) column in CSV that corresponds to the attribute.
        :param val_col: (str) column in CSV that corresponds to correct value
            for the current TID and attribute (i.e. cell).
        :param na_values: (Any) how na_values are represented in the data.

        Returns an EvalReport named tuple containing the experiment results.
        """
        # The ground-truth table is named after the raw dataset.
        name = self.ds.raw_data.name + '_clean'
        status, load_time = self.eval_engine.load_data(name,
                                                       fpath,
                                                       tid_col,
                                                       attr_col,
                                                       val_col,
                                                       na_values=na_values)
        logging.info(status)
        logging.debug('Time to evaluate repairs: %.2f secs', load_time)
        status, report_time, eval_report = self.eval_engine.eval_report()
        logging.info(status)
        logging.debug('Time to generate report: %.2f secs', report_time)
        return eval_report

    def get_predictions(self):
        """
        Returns a dataframe with 4 columns, all built with dtype str:
            - tid, attribute, inferred_val, proba
        """

        # Join the cell-domain table with the inferred values on _vid_.
        query = """
        SELECT
            _tid_, attribute, inferred_val, prob
        FROM {dom}
        INNER JOIN {inf_vals} USING(_vid_)
        """.format(inf_vals=AuxTables.inf_values_idx.name,
                   dom=AuxTables.cell_domain.name)
        res = self.ds.engine.execute_query(query)
        df_preds = pd.DataFrame(
            res,
            columns=['tid', 'attribute', 'inferred_val', 'proba'],
            dtype=str)
        return df_preds
Esempio n. 6
0
class Session:
    """
    Session class controls the entire pipeline of HC
    """
    def __init__(self, env, name="session"):
        """
        Constructor for Holoclean session
        :param env: Holoclean environment
        :param name: Name for the Holoclean session
        """

        # Initialize members
        self.name = name
        self.env = env
        self.ds = Dataset(name, env)
        self.dc_parser = Parser(env, self.ds)
        self.domain_engine = DomainEngine(env, self.ds)
        self.detect_engine = DetectEngine(env, self.ds)
        self.repair_engine = RepairEngine(env, self.ds)
        self.eval_engine = EvalEngine(env, self.ds)

    def load_data(self, name, f_path, f_name, na_values=None):
        """
        Load the initial dataset through the Dataset and print its status.
        :param name: name to initialize dataset with.
        :param f_path: forwarded to Dataset.load_data.
        :param f_name: forwarded to Dataset.load_data.
        :param na_values: forwarded to Dataset.load_data.
        """
        status, load_time = self.ds.load_data(name,
                                              f_path,
                                              f_name,
                                              na_values=na_values)
        print(status)
        if self.env['verbose']:
            print('Time to load dataset: %.2f secs' % load_time)

    def load_dcs(self, f_path, f_name):
        """
        Load the denial constraints through the parser and print its status.
        :param f_path: forwarded to Parser.load_denial_constraints.
        :param f_name: forwarded to Parser.load_denial_constraints.
        """
        status, load_time = self.dc_parser.load_denial_constraints(
            f_path, f_name)
        print(status)
        if self.env['verbose']:
            # The label previously said "dirty data", which is not what is
            # loaded here.
            print('Time to load denial constraints: %.2f secs' % load_time)

    def get_dcs(self):
        """
        Return whatever the denial-constraint parser reports via get_dcs().
        """
        return self.dc_parser.get_dcs()

    def detect_errors(self, detect_list):
        """
        Run error detection through the detect engine and print its status.
        :param detect_list: passed unchanged to DetectEngine.detect_errors.
        """
        status, detect_time = self.detect_engine.detect_errors(detect_list)
        print(status)
        if self.env['verbose']:
            print('Time to detect errors: %.2f secs' % detect_time)

    def setup_domain(self):
        """
        Run the domain engine's setup step and print its status.
        """
        status, domain_time = self.domain_engine.setup()
        print(status)
        if self.env['verbose']:
            print('Time to setup the domain: %.2f secs' % domain_time)

    def repair_errors(self, featurizers):
        """
        Run the repair pipeline: featurize, set up, fit, infer, then collect
        the inferred values and the repaired dataset. Each step's status is
        printed; durations are printed only when env['verbose'] is truthy.
        :param featurizers: passed unchanged to
            RepairEngine.setup_featurized_ds.
        """
        verbose = self.env['verbose']
        status, feat_time = self.repair_engine.setup_featurized_ds(featurizers)
        print(status)
        if verbose:
            print('Time to featurize data: %.2f secs' % feat_time)
        status, setup_time = self.repair_engine.setup_repair_model()
        print(status)
        if verbose:
            # Report the setup duration itself (the featurization time used
            # to be printed here by mistake).
            print('Time to setup repair model: %.2f secs' % setup_time)
        status, fit_time = self.repair_engine.fit_repair_model()
        print(status)
        if verbose:
            print('Time to fit repair model: %.2f secs' % fit_time)
        status, infer_time = self.repair_engine.infer_repairs()
        print(status)
        if verbose:
            print('Time to infer correct cell values: %.2f secs' % infer_time)
        status, collect_time = self.ds.get_inferred_values()
        print(status)
        if verbose:
            print('Time to collect inferred values: %.2f secs' % collect_time)
        status, store_time = self.ds.get_repaired_dataset()
        print(status)
        if verbose:
            print('Time to store repaired dataset: %.2f secs' % store_time)

    def evaluate(self,
                 f_path,
                 f_name,
                 get_tid,
                 get_attr,
                 get_value,
                 na_values=None):
        """
        Load the ground-truth data into the eval engine and generate the
        evaluation report, printing the status of both steps.
        :param f_path: forwarded to EvalEngine.load_data.
        :param f_name: forwarded to EvalEngine.load_data.
        :param get_tid: forwarded to EvalEngine.load_data.
        :param get_attr: forwarded to EvalEngine.load_data.
        :param get_value: forwarded to EvalEngine.load_data.
        :param na_values: forwarded to EvalEngine.load_data.
        """
        # The ground-truth table is named after the raw dataset.
        name = self.ds.raw_data.name + '_clean'
        status, load_time = self.eval_engine.load_data(name,
                                                       f_path,
                                                       f_name,
                                                       get_tid,
                                                       get_attr,
                                                       get_value,
                                                       na_values=na_values)
        print(status)
        if self.env['verbose']:
            print('Time to evaluate repairs: %.2f secs' % load_time)
        status, report_time = self.eval_engine.eval_report()
        print(status)
        if self.env['verbose']:
            print('Time to generate report: %.2f secs' % report_time)