def initialize_tracking_and_get_run_id(
    experiment_hash,
    experiment_class_path,
    random_seed,
    experiment_kwargs,
    db_engine,
):
    """Create a row in the TriageRun table with some initial info and return the created run_id

    Args:
        experiment_hash (str) An experiment hash that exists in the experiments table
        experiment_class_path (str) The name of the experiment subclass used
        random_seed (int) Random seed used to run the experiment
        experiment_kwargs (dict) Any runtime Experiment keyword arguments that should be saved
        db_engine (sqlalchemy.engine)
    """
    # Any experiment kwargs that are types (e.g. MatrixStorageClass) can't
    # be serialized, so just use the class name if so
    cleaned_experiment_kwargs = {
        k: (classpath(v) if isinstance(v, type) else v)
        for k, v in experiment_kwargs.items()
    }
    run = TriageRun(
        start_time=datetime.datetime.now(),
        git_hash=infer_git_hash(),
        triage_version=infer_triage_version(),
        python_version=infer_python_version(),
        run_type="experiment",
        run_hash=experiment_hash,
        last_updated_time=datetime.datetime.now(),
        current_status=TriageRunStatus.started,
        installed_libraries=infer_installed_libraries(),
        platform=platform.platform(),
        os_user=getpass.getuser(),
        working_directory=os.getcwd(),
        ec2_instance_type=infer_ec2_instance_type(),
        log_location=infer_log_location(),
        experiment_class_path=experiment_class_path,
        random_seed=random_seed,
        experiment_kwargs=cleaned_experiment_kwargs,
    )
    run_id = None
    with scoped_session(db_engine) as session:
        session.add(run)
        session.commit()
        run_id = run.run_id
    if not run_id:
        raise ValueError("Failed to retrieve run_id from saved row")
    return run_id
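
# Usage sketch (illustrative, not part of the original module). Assumes a
# database that already carries the triage results schema and an experiment
# row previously saved via save_experiment_and_get_hash; the connection URL
# and the hash/classpath values below are placeholders.
def _example_initialize_tracking():  # hypothetical helper for illustration
    from sqlalchemy import create_engine

    engine = create_engine("postgresql://user:pass@localhost:5432/triage")  # placeholder URL
    run_id = initialize_tracking_and_get_run_id(
        experiment_hash="abc123",  # must already exist in the experiments table
        experiment_class_path="triage.experiments.singlethreaded.SingleThreadedExperiment",
        random_seed=42,
        experiment_kwargs={"replace": True, "save_predictions": True},
        db_engine=engine,
    )
    return run_id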
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
    skip_validation=False,
    partial_run=False,
):
    # For a partial run, skip validation and avoid cleaning up;
    # we'll also skip filling default config values below
    if partial_run:
        cleanup = False
        skip_validation = True
    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {"db_engine", "config", "self"}
        },
    )

    self._check_config_version(config)
    self.config = config
    # randint requires integer bounds, so cast the upper bound explicitly
    self.config["random_seed"] = self.config.get(
        "random_seed", random.randint(1, int(1e7))
    )
    random.seed(self.config["random_seed"])

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class
    )
    self.project_path = project_path
    self.replace = replace
    self.save_predictions = save_predictions
    self.skip_validation = skip_validation
    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort

    # only fill default values for full runs
    if not partial_run:
        ## Defaults to sane values
        self.config["temporal_config"] = fill_timechop_config_missing(
            self.config, self.db_engine
        )
        ## Defaults to all the entities found in the feature_aggregation's from_obj
        self.config["cohort_config"] = fill_cohort_config_missing(self.config)
        ## Defaults to all the feature_aggregation's prefixes
        self.config["feature_group_definition"] = fill_feature_group_definition(
            self.config
        )

    grid_config = fill_model_grid_presets(self.config)
    self.config.pop("model_grid_preset", None)
    if grid_config is not None:
        self.config["grid_config"] = grid_config

    ###################### RUBICON ######################

    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        # required by initialize_tracking_and_get_run_id's signature;
        # pass the seed recorded in the config above
        random_seed=self.config["random_seed"],
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine,
    )
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed"
        )
    # falls back to the class-level cleanup_timeout default when not overridden
    self.cleanup_timeout = (
        self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
    )
    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s", self.profile)
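
# Aside (illustrative, not from the original source): the bind_kwargs call
# above snapshots every constructor keyword by filtering locals(). The idiom
# works because the outermost iterable of a comprehension is evaluated in the
# enclosing function scope, before the comprehension introduces its own names.
# A minimal standalone sketch with made-up parameter names:
def _example_capture_kwargs(alpha=1, beta="x", gamma=None):
    # locals() here contains exactly the bound parameters, since no other
    # local variable has been assigned yet
    captured = {key: value for key, value in locals().items()}
    return captured  # -> {'alpha': 1, 'beta': 'x', 'gamma': None}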
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    additional_bigtrain_classnames=None,
    profile=False,
    save_predictions=True,
    skip_validation=False,
    partial_run=False,
):
    # For a partial run, skip validation and avoid cleaning up;
    # we'll also skip filling default config values below
    if partial_run:
        cleanup = False
        skip_validation = True
    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {"db_engine", "config", "self"}
        },
    )

    self._check_config_version(config)
    self.config = config

    if self.config.get("cohort_config") is not None:
        self.config["cohort_config"] = load_query_if_needed(
            self.config["cohort_config"]
        )
    if self.config.get("label_config") is not None:
        self.config["label_config"] = load_query_if_needed(
            self.config["label_config"]
        )

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class
    )
    self.project_path = project_path
    logger.verbose(
        f"Matrices and trained models will be saved in {self.project_path}"
    )
    self.replace = replace
    if self.replace:
        logger.notice(
            "Replace flag is set to true. Matrices, models, "
            "evaluations and predictions (if they exist) will be replaced"
        )

    self.save_predictions = save_predictions
    if not self.save_predictions:
        logger.notice(
            "Save predictions flag is set to false. "
            "Individual predictions won't be stored in the predictions "
            "table. This will decrease both the running time "
            "of an experiment and the space needed in the db"
        )

    self.skip_validation = skip_validation
    if self.skip_validation:
        logger.notice(
            "Warning: Skip validation flag is set to true. "
            "The specified experiment config file won't be validated. "
            "This will reduce (a little) the running time of the experiment, "
            "but has some potential risks, e.g. the experiment could fail "
            "after some time due to a misconfiguration. Proceed with care."
        )

    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"

    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    if not self.materialize_subquery_fromobjs:
        logger.notice(
            "Materialize from_objs is set to false. "
            "The from_objs will be calculated on the fly every time."
        )

    self.features_ignore_cohort = features_ignore_cohort
    if self.features_ignore_cohort:
        logger.notice(
            "Features will be calculated for all the entities "
            "(i.e. ignoring cohort). This setting will have the effect "
            "that more db space is used, but it could potentially save "
            "time if you are running several similar experiments with "
            "different cohorts."
        )

    self.additional_bigtrain_classnames = additional_bigtrain_classnames

    # only fill default values for full runs
    if not partial_run:
        ## Defaults to sane values
        self.config["temporal_config"] = fill_timechop_config_missing(
            self.config, self.db_engine
        )
        ## Defaults to all the entities found in the feature_aggregation's from_obj
        self.config["cohort_config"] = fill_cohort_config_missing(self.config)
        ## Defaults to all the feature_aggregation's prefixes
        self.config["feature_group_definition"] = fill_feature_group_definition(
            self.config
        )

    grid_config = fill_model_grid_presets(self.config)
    self.config.pop("model_grid_preset", None)
    if grid_config is not None:
        self.config["grid_config"] = grid_config

    if not self.config.get("random_seed", None):
        logger.notice(
            "Random seed not specified. A random seed will be provided. "
            "This could have interesting side effects, "
            "e.g. new models per model group are trained, "
            "tested and evaluated every time that you run this experiment configuration"
        )
    # randint requires integer bounds, so cast the upper bound explicitly
    self.random_seed = self.config.pop("random_seed", random.randint(1, int(1e7)))
    logger.verbose(
        f"Using random seed [{self.random_seed}] for running the experiment"
    )
    random.seed(self.random_seed)

    ###################### RUBICON ######################

    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    logger.debug(f"Experiment hash [{self.experiment_hash}] assigned")
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        random_seed=self.random_seed,
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine,
    )
    logger.debug(f"Experiment run id [{self.run_id}] assigned")

    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logger.notice(
            "Cleanup is set to true, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )
    # falls back to the class-level cleanup_timeout default when not overridden
    self.cleanup_timeout = (
        self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
    )
    self.profile = profile
    if self.profile:
        logger.spam("Profiling will be stored using cProfile")
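
# Construction sketch (illustrative): how the keyword arguments above are
# typically supplied. SingleThreadedExperiment is one concrete Experiment
# subclass in triage; the config dict and paths shown are stand-ins, not a
# complete experiment configuration.
def _example_build_experiment(config, db_engine):  # hypothetical helper
    from triage.experiments import SingleThreadedExperiment

    return SingleThreadedExperiment(
        config=config,                      # experiment config; random_seed optional here
        db_engine=db_engine,
        project_path="/tmp/triage-output",  # placeholder; matrices and models land here
        replace=False,                      # reuse existing matrices/models if present
        save_predictions=True,
        skip_validation=False,
    )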
def __init__(
    self,
    config,
    db_engine,
    project_path=None,
    matrix_storage_class=CSVMatrixStore,
    replace=True,
    cleanup=False,
    cleanup_timeout=None,
    materialize_subquery_fromobjs=True,
    features_ignore_cohort=False,
    profile=False,
    save_predictions=True,
    skip_validation=False,
):
    experiment_kwargs = bind_kwargs(
        self.__class__,
        **{
            key: value
            for (key, value) in locals().items()
            if key not in {"db_engine", "config", "self"}
        },
    )

    self._check_config_version(config)
    self.config = config
    # this variant requires random_seed to be present in the config;
    # a missing key raises KeyError here
    random.seed(config["random_seed"])

    self.project_storage = ProjectStorage(project_path)
    self.model_storage_engine = ModelStorageEngine(self.project_storage)
    self.matrix_storage_engine = MatrixStorageEngine(
        self.project_storage, matrix_storage_class
    )
    self.project_path = project_path
    self.replace = replace
    self.save_predictions = save_predictions
    self.skip_validation = skip_validation
    self.db_engine = db_engine
    results_schema.upgrade_if_clean(dburl=self.db_engine.url)

    self.features_schema_name = "features"
    self.materialize_subquery_fromobjs = materialize_subquery_fromobjs
    self.features_ignore_cohort = features_ignore_cohort

    self.experiment_hash = save_experiment_and_get_hash(self.config, self.db_engine)
    self.run_id = initialize_tracking_and_get_run_id(
        self.experiment_hash,
        experiment_class_path=classpath(self.__class__),
        # required by initialize_tracking_and_get_run_id's signature
        random_seed=self.config["random_seed"],
        experiment_kwargs=experiment_kwargs,
        db_engine=self.db_engine,
    )
    self.initialize_components()

    self.cleanup = cleanup
    if self.cleanup:
        logging.info(
            "cleanup is set to True, so intermediate tables (labels and cohort) "
            "will be removed after matrix creation and subset tables will be "
            "removed after model training and testing"
        )
    else:
        logging.info(
            "cleanup is set to False, so intermediate tables (labels, cohort, and subsets) "
            "will not be removed"
        )
    # falls back to the class-level cleanup_timeout default when not overridden
    self.cleanup_timeout = (
        self.cleanup_timeout if cleanup_timeout is None else cleanup_timeout
    )
    self.profile = profile
    logging.info("Generate profiling stats? (profile option): %s", self.profile)
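
# This older variant reads config['random_seed'] directly, so a config that
# omits the key fails before any work is done. A small defensive sketch of the
# fallback pattern the later versions adopt (illustrative only; relies on the
# module-level `random` import already used above):
def _example_seed_from_config(config):
    """Return the config's random seed, generating one when absent."""
    seed = config.get("random_seed")
    if seed is None:
        seed = random.randint(1, int(1e7))  # int bounds keep randint happy
    random.seed(seed)
    return seed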
def retrain(self, prediction_date):
    """Retrain a model by going back one split from prediction_date, so the
    as_of_date for training would be (prediction_date - training_label_timespan)

    Args:
        prediction_date (str)
    """
    # Retrain config and hash
    retrain_config = {
        "model_group_id": self.model_group_id,
        "prediction_date": prediction_date,
        "test_label_timespan": self.test_label_timespan,
        "test_duration": self.test_duration,
    }
    self.retrain_hash = save_retrain_and_get_hash(retrain_config, self.db_engine)

    with get_for_update(self.db_engine, Retrain, self.retrain_hash) as retrain:
        retrain.prediction_date = prediction_date

    # Timechop
    prediction_date = dt_from_str(prediction_date)
    temporal_config = self.get_temporal_config_for_retrain(prediction_date)
    timechopper = Timechop(**temporal_config)
    chops = timechopper.chop_time()
    assert len(chops) == 1
    chops_train_matrix = chops[0]["train_matrix"]
    as_of_date = datetime.strftime(chops_train_matrix["last_as_of_time"], "%Y-%m-%d")
    retrain_definition = {
        "first_as_of_time": chops_train_matrix["first_as_of_time"],
        "last_as_of_time": chops_train_matrix["last_as_of_time"],
        "matrix_info_end_time": chops_train_matrix["matrix_info_end_time"],
        "as_of_times": [as_of_date],
        "training_label_timespan": chops_train_matrix["training_label_timespan"],
        "max_training_history": chops_train_matrix["max_training_history"],
        "training_as_of_date_frequency": chops_train_matrix["training_as_of_date_frequency"],
    }

    # Record a TriageRun for this retrain
    run = TriageRun(
        start_time=datetime.now(),
        git_hash=infer_git_hash(),
        triage_version=infer_triage_version(),
        python_version=infer_python_version(),
        run_type="retrain",
        run_hash=self.retrain_hash,
        last_updated_time=datetime.now(),
        current_status=TriageRunStatus.started,
        installed_libraries=infer_installed_libraries(),
        platform=platform.platform(),
        os_user=getpass.getuser(),
        working_directory=os.getcwd(),
        ec2_instance_type=infer_ec2_instance_type(),
        log_location=infer_log_location(),
        experiment_class_path=classpath(self.__class__),
        random_seed=retrieve_experiment_seed_from_run_id(
            self.db_engine, self.triage_run_id
        ),
    )
    run_id = None
    with scoped_session(self.db_engine) as session:
        session.add(run)
        session.commit()
        run_id = run.run_id
    if not run_id:
        raise ValueError("Failed to retrieve run_id from saved row")

    # set ModelTrainer's run_id and experiment_hash for the retrain run
    self.model_trainer.run_id = run_id
    self.model_trainer.experiment_hash = self.retrain_hash

    # 1. Generate all labels
    self.generate_all_labels(as_of_date)
    record_labels_table_name(run_id, self.db_engine, self.labels_table_name)

    # 2. Generate cohort
    cohort_table_name = (
        f"triage_production.cohort_{self.experiment_config['cohort_config']['name']}_retrain"
    )
    self.generate_entity_date_table(as_of_date, cohort_table_name)
    record_cohort_table_name(run_id, self.db_engine, cohort_table_name)

    # 3. Generate feature aggregations
    collate_aggregations = self.get_collate_aggregations(as_of_date, cohort_table_name)
    feature_aggregation_table_tasks = self.feature_generator.generate_all_table_tasks(
        collate_aggregations, task_type="aggregation"
    )
    self.feature_generator.process_table_tasks(feature_aggregation_table_tasks)

    # 4. Reconstruct feature dictionary from feature_names and generate imputation
    reconstructed_feature_dict, imputation_table_tasks = (
        self.get_feature_dict_and_imputation_task(
            collate_aggregations,
            self.model_group_info["model_id_last_split"],
        )
    )
    feature_group_creator = FeatureGroupCreator(
        self.experiment_config["feature_group_definition"]
    )
    feature_group_mixer = FeatureGroupMixer(["all"])
    feature_group_dict = feature_group_mixer.generate(
        feature_group_creator.subsets(reconstructed_feature_dict)
    )[0]
    self.feature_generator.process_table_tasks(imputation_table_tasks)

    # 5. Build new matrix
    db_config = {
        "features_schema_name": "triage_production",
        "labels_schema_name": "public",
        "cohort_table_name": cohort_table_name,
        "labels_table_name": self.labels_table_name,
    }
    record_matrix_building_started(run_id, self.db_engine)
    matrix_builder = MatrixBuilder(
        db_config=db_config,
        matrix_storage_engine=self.matrix_storage_engine,
        engine=self.db_engine,
        experiment_hash=None,
        replace=True,
    )
    new_matrix_metadata = Planner.make_metadata(
        matrix_definition=retrain_definition,
        feature_dictionary=feature_group_dict,
        label_name=self.label_name,
        label_type="binary",
        cohort_name=self.cohort_name,
        matrix_type="train",
        feature_start_time=dt_from_str(self.feature_start_time),
        user_metadata=self.user_metadata,
    )
    new_matrix_metadata["matrix_id"] = "_".join(
        [
            self.label_name,
            "binary",
            str(as_of_date),
            "retrain",
        ]
    )
    matrix_uuid = filename_friendly_hash(new_matrix_metadata)
    matrix_builder.build_matrix(
        as_of_times=[as_of_date],
        label_name=self.label_name,
        label_type="binary",
        feature_dictionary=feature_group_dict,
        matrix_metadata=new_matrix_metadata,
        matrix_uuid=matrix_uuid,
        matrix_type="train",
    )

    retrain_model_comment = "retrain_" + str(datetime.now())
    misc_db_parameters = {
        "train_end_time": dt_from_str(as_of_date),
        "test": False,
        "train_matrix_uuid": matrix_uuid,
        "training_label_timespan": self.training_label_timespan,
        "model_comment": retrain_model_comment,
    }

    # get the random seed from the last split
    last_split_train_matrix_uuid, last_split_matrix_metadata = (
        train_matrix_info_from_model_id(
            self.db_engine, model_id=self.model_group_info["model_id_last_split"]
        )
    )
    random_seed = self.model_trainer.get_or_generate_random_seed(
        model_group_id=self.model_group_id,
        matrix_metadata=last_split_matrix_metadata,
        train_matrix_uuid=last_split_train_matrix_uuid,
    )

    # create retrain model hash
    retrain_model_hash = self.model_trainer._model_hash(
        self.matrix_storage_engine.get_store(matrix_uuid).metadata,
        class_path=self.model_group_info["model_type"],
        parameters=self.model_group_info["hyperparameters"],
        random_seed=random_seed,
    )

    associate_models_with_retrain(
        self.retrain_hash, (retrain_model_hash,), self.db_engine
    )

    record_model_building_started(run_id, self.db_engine)
    retrain_model_id = self.model_trainer.process_train_task(
        matrix_store=self.matrix_storage_engine.get_store(matrix_uuid),
        class_path=self.model_group_info["model_type"],
        parameters=self.model_group_info["hyperparameters"],
        model_hash=retrain_model_hash,
        misc_db_parameters=misc_db_parameters,
        random_seed=random_seed,
        retrain=True,
        model_group_id=self.model_group_id,
    )

    self.retrain_model_hash = retrieve_model_hash_from_id(self.db_engine, retrain_model_id)
    self.retrain_matrix_uuid = matrix_uuid
    self.retrain_model_id = retrain_model_id
    return {
        "retrain_model_comment": retrain_model_comment,
        "retrain_model_id": retrain_model_id,
    }
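
# Usage sketch (illustrative): retrain() is driven by a single prediction-date
# string; everything else comes from attributes configured elsewhere on the
# object (model_group_id, db_engine, model_trainer, ...). The construction of
# that surrounding object is not shown because its signature isn't part of
# this excerpt.
def _example_retrain(retrainer):  # `retrainer` is an already-configured instance
    results = retrainer.retrain(prediction_date="2021-06-01")
    print(results["retrain_model_id"], results["retrain_model_comment"])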