Code Example #1
    def _verify_dataset_integrity(self):
        # Recompute the hash of the current dataset and compare it with the
        # stored hash; if they differ, the dataset was modified externally
        # and is reloaded.
        new_hash = compute_dataset_hash(self.dataset)
        if self.__dataset_hash != new_hash:
            print("Old hash: {}".format(self.__dataset_hash), file=sys.stderr)
            print("New hash: {}".format(new_hash), file=sys.stderr)
            # TODO: exception handling
            self._reload_dataset()
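
The helper compute_dataset_hash is referenced in these excerpts but its implementation is not shown. Purely as an illustration, a minimal sketch of such a helper, assuming the dataset is a dict that maps table names to pandas DataFrames (the real implementation may hash the data differently):

    import hashlib

    import pandas as pd

    def compute_dataset_hash(dataset):
        """Illustrative only: deterministic digest over a dict of DataFrames."""
        md5 = hashlib.md5()
        # Sort by table name so the digest does not depend on dict ordering.
        for table_name in sorted(dataset):
            md5.update(table_name.encode("utf-8"))
            # hash_pandas_object yields one uint64 per row; feed its bytes in.
            row_hashes = pd.util.hash_pandas_object(dataset[table_name], index=True)
            md5.update(row_hashes.values.tobytes())
        return md5.hexdigest()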
Code Example #2
    def __init__(self,
                 problem_id,
                 username,
                 orm,
                 dataset=None,
                 target=None,
                 entities_featurized=None):
        self.problem_id = problem_id
        self.username = username
        self.orm = orm
        # Avoid a mutable default argument: fall back to a fresh dict per instance.
        self.dataset = dataset if dataset is not None else {}
        self.target = target
        self.entities_featurized = entities_featurized

        if self.dataset:
            self.__dataset_hash = compute_dataset_hash(self.dataset)
        else:
            self.__dataset_hash = None
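
The original signature used dataset={} as a default. In Python, default values are evaluated once at function definition time, so a mutable default is shared across every call that omits the argument; the None-sentinel pattern used above avoids that. A self-contained illustration of the pitfall, with hypothetical names unrelated to the class above:

    def append_broken(item, items=[]):
        # The same list object is reused by every call that omits `items`.
        items.append(item)
        return items

    def append_fixed(item, items=None):
        # A fresh list is created on each call instead.
        items = items if items is not None else []
        items.append(item)
        return items

    print(append_broken(1))  # [1]
    print(append_broken(2))  # [1, 2]  <- state leaked from the first call
    print(append_fixed(1))   # [1]
    print(append_fixed(2))   # [2]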
Code Example #3
    def _load_dataset_split(self,
                            split="train",
                            dataset=None,
                            entities_featurized=None,
                            target=None,
                            dataset_hash=None,
                            compute_hash=True):
        # Avoid a mutable default argument: a bare {} default would be shared
        # across calls and mutated by the loading loop below.
        if dataset is None:
            dataset = {}

        # query db for import parameters to load files
        is_present_dataset = bool(dataset)
        is_present_entities_featurized = not pd.DataFrame(
            entities_featurized).empty
        is_present_target = not pd.DataFrame(target).empty
        is_anything_missing = not all([
            is_present_dataset, is_present_entities_featurized,
            is_present_target
        ])

        if is_anything_missing:
            with self.orm.session_scope() as session:
                problem = session.query(Problem)\
                        .filter(Problem.id == self.problem_id).one()
                problem_data_dir = getattr(problem,
                                           "data_dir_{}".format(split))
                problem_files = json.loads(problem.files)
                problem_table_names = json.loads(problem.table_names)
                problem_entities_featurized_table_name = \
                    problem.entities_featurized_table_name
                problem_target_table_name = problem.target_table_name

        # load entities and other tables
        if not is_present_dataset:
            # load other tables
            for (table_name, filename) in zip(problem_table_names,
                                              problem_files):
                if table_name == problem_entities_featurized_table_name or \
                   table_name == problem_target_table_name:
                    continue
                abs_filename = os.path.join(problem_data_dir, filename)
                dataset[table_name] = pd.read_csv(abs_filename,
                                                  low_memory=False,
                                                  header=0)

            # compute/recompute hash once after all tables have been loaded,
            # rather than redundantly on every loop iteration
            if compute_hash:
                dataset_hash = compute_dataset_hash(dataset)
            else:
                dataset_hash = None

        # Recompute the dataset hash. This branch is only taken when the dataset
        # was already loaded (and passed in) but its hash had not been computed,
        # since otherwise the hash was just computed a few lines above.
        if compute_hash and not dataset_hash:
            dataset_hash = compute_dataset_hash(dataset)

        # load entities featurized
        if not is_present_entities_featurized:
            # if empty string, we simply don't have any features to add
            if problem_entities_featurized_table_name:
                cols = list(problem_table_names)
                ind_features = cols.index(
                    problem_entities_featurized_table_name)
                abs_filename = os.path.join(problem_data_dir,
                                            problem_files[ind_features])
                entities_featurized = pd.read_csv(abs_filename,
                                                  low_memory=False,
                                                  header=0)

        # load target
        if not is_present_target:
            cols = list(problem_table_names)
            ind_target = cols.index(problem_target_table_name)
            abs_filename = os.path.join(problem_data_dir,
                                        problem_files[ind_target])

            # target might not exist if we are making predictions on unseen
            # test data
            if os.path.exists(abs_filename):
                target = pd.read_csv(abs_filename, low_memory=False, header=0)
            else:
                target = None

        return dataset, entities_featurized, target, dataset_hash
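
None of the excerpts show how _load_dataset_split is invoked. A hedged sketch of a possible caller, assuming it lives on the same class as Code Example #2 so that the instance attributes and the name-mangled __dataset_hash are accessible; the method name and wiring here are assumptions, not part of the original code:

    def _load_dataset(self, split="train"):
        # Hypothetical caller: populate instance attributes from the split
        # loader and keep the returned hash for later integrity checks.
        (self.dataset,
         self.entities_featurized,
         self.target,
         self.__dataset_hash) = self._load_dataset_split(
             split=split,
             dataset=self.dataset,
             entities_featurized=self.entities_featurized,
             target=self.target,
             dataset_hash=self.__dataset_hash,
             compute_hash=True)
        return self.dataset, self.entities_featurized, self.target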