Example #1
    def output(self):
        """
        We output a log file specifying the model type
        """
        experiments_path = self.conf.get(self.pipeline_task, "experiments")
        with open(experiments_path) as experiments_file:
            experiments_dict = json.load(experiments_file)

        cur_exper = pg_sed.fill_experiment_defaults(
            experiments_dict[self.exper_id], self.conf, self.pipeline_task)

        # fields that uniquely identify a feature matrix
        features_dict = OrderedDict([
            ("subset_name", cur_exper["subset_name"]),
            ("features", cur_exper["features"]),
            ("preprocessing", cur_exper["preprocessing"])
        ])

        features_basename = self.conf.get(self.pipeline_task,
                                          "features_basename")
        features_str = features_basename + \
                       pg_sed.hash_if_needed("".join(features_dict.values()))

        responses_basename = self.conf.get(self.pipeline_task,
                                           "responses_basename")
        responses_str = responses_basename + \
                        pg_sed.hash_if_needed(cur_exper["subset_name"])
        basenames = {
            "train_features": "%s_train_%s" % (features_str, self.cur_fold),
            "train_responses": "%s_train_%s" % (responses_str, self.cur_fold),
            "test_features": "%s_test_%s" % (features_str, self.cur_fold),
            "test_responses": "%s_test_%s" % (responses_str, self.cur_fold)
        }

        model_dict = model_funs.get_model_dict(
            self.theta, self.cur_fold, cur_exper,
            self.conf.get(self.pipeline_task, "responses"), self.seed,
            basenames, self.conf.get(self.pipeline_task, "model_data_schema"))

        # the log file whose existence marks a successful run
        output_path = "%s/models/model%s_%s.log" % (
            self.logging_path, self.pipeline_task, model_dict["string"])

        return luigi.LocalTarget(output_path)
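
The file and table names above run every identifying field through pg_sed.hash_if_needed. The real helper lives in pg_sed and is not shown here; a minimal sketch of what it plausibly does, assuming its job is to keep derived names under filesystem and Postgres identifier length limits:

import hashlib

def hash_if_needed(string, max_len=50):
    # hypothetical sketch: pass short names through unchanged, and
    # replace long ones with a stable digest so reruns of the same
    # experiment map to the same file / table name
    if len(string) <= max_len:
        return string
    return hashlib.md5(string.encode("utf-8")).hexdigest()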
Example #2
def get_model_dict(theta,
                   cur_fold,
                   cur_exper,
                   responses,
                   seed,
                   data_path=None,
                   model_data_schema=None):
    """
    Return a string representing a model

    :param string model_name The name of the model type, as in the keys to
     MODELS_MAPPING.
    :param string theta A string of the dictionary encoding the parameter values
     for the current model run. This comes from a single entry of calling
     ParameterGrid() on a value in the models.json.
    :param int cur_fold The fold for the current training data.
    :param luigi.configuration conf The luigi configuration information.
    :return A dictionary giving specification of the model.
    :rtype dict
    """
    # load json object
    features_path = cur_exper["features"]
    preprocessing_path = cur_exper["preprocessing"]
    with open(features_path) as json_file:
        features = json.load(json_file)

    params_string = pg_sed.strip_punct(str(theta))
    model_dict = OrderedDict([("model", cur_exper["model"]["name"]),
                              ("responses", responses),
                              ("features", cur_exper["features"]),
                              ("subset", cur_exper["subset_name"]),
                              ("params", params_string),
                              ("preprocessing", cur_exper["preprocessing"]),
                              ("fold", str(cur_fold)), ("seed", seed)])

    # hash the concatenated specification into a short identifier
    model_string = pg_sed.hash_if_needed("".join(model_dict.values()))
    return {
        "string": model_string,
        "response": responses,
        "features_path": features_path,
        "features": features,
        "subset": cur_exper["subset_name"],
        "model": cur_exper["model"]["name"],
        "param": theta,
        "preprocessing": preprocessing_path,
        "fold": cur_fold,
        "seed": seed,
        "model_data_schema": model_data_schema,
        "model_data": data_path
    }
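
The theta argument is the string form of one parameter setting; per the docstring it comes from calling sklearn's ParameterGrid() on an entry in models.json. A hedged sketch of how those strings might be produced upstream:

from sklearn.model_selection import ParameterGrid

# hypothetical models.json entry expanded into parameter points;
# each point is stringified before being passed around as `theta`
grid = ParameterGrid({"max_depth": [4, 8], "n_estimators": [100, 500]})
thetas = [str(point) for point in grid]
# e.g. "{'max_depth': 4, 'n_estimators': 100}"

Note that Example #8 recovers the dictionary with ast.literal_eval(self.theta), which round-trips cleanly with str() on a dict of literals.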
Example #3
    def output(self):
        """
        Write responses obtained by leaving out a fold at a time
        """
        responses_basename = self.conf.get(self.pipeline_task,
                                           "responses_basename")
        responses_str = responses_basename + \
                        pg_sed.hash_if_needed(self.subset_table)
        base_path = pg_sed.process_filter_name(self.responses_dir,
                                               self.filter_condition,
                                               responses_str)
        responses_path = base_path + ".csv"
        schema_path = base_path + "_schema.csv"

        return [
            luigi.LocalTarget(responses_path),
            luigi.LocalTarget(schema_path)
        ]
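
Examples #3, #4, and #6 all build paths with pg_sed.process_filter_name, whose implementation is not shown. A plausible sketch, assuming it slugifies the SQL filter condition into a filesystem-safe suffix:

import os
import re

def process_filter_name(output_dir, filter_condition, basename):
    # hypothetical sketch: encode the filter used to subset the rows
    # into the filename, e.g. "WHERE cv_index <> 0" -> "where_cv_index_0"
    if filter_condition is None:
        return os.path.join(output_dir, basename)
    slug = re.sub(r"[^0-9a-zA-Z]+", "_", filter_condition).strip("_").lower()
    return os.path.join(output_dir, "%s_%s" % (basename, slug))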
Example #4
    def output(self):
        """
        Return targets for the features csv and its schema
        """
        features_basename = self.conf.get(self.pipeline_task,
                                          "features_basename")
        features_dict = json.loads(self.features_dict,
                                   object_pairs_hook=OrderedDict)

        features_str = features_basename + \
                       pg_sed.hash_if_needed("".join(features_dict.values()))
        base_path = pg_sed.process_filter_name(self.features_dir,
                                               self.filter_condition,
                                               features_str)

        features_path = base_path + ".csv"
        schema_path = base_path + "_schema.csv"
        return [
            luigi.LocalTarget(features_path),
            luigi.LocalTarget(schema_path)
        ]
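
The features_dict parameter arrives as a JSON string, and parsing it with object_pairs_hook=OrderedDict keeps key order stable, so the concatenation fed to hash_if_needed is reproducible across runs. A hypothetical value, shaped like the OrderedDict built in Examples #1 and #8:

import json
from collections import OrderedDict

features_dict = json.loads(
    '{"subset_name": "all_students",'
    ' "features": "conf/features.json",'
    ' "preprocessing": "conf/preprocessing.json"}',
    object_pairs_hook=OrderedDict)
print("".join(features_dict.values()))
# all_studentsconf/features.jsonconf/preprocessing.json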
Example #5
    def requires(self):
        """
        Use the filter_condition parameter to select all folds except for the
        held out one
        """
        models_data_table = self.conf.get(self.pipeline_task, "model_data_schema")

        if self.table_type == "features":
            table_basename = self.conf.get(self.pipeline_task, "features_basename")
        elif self.table_type == "responses":
            table_basename = self.conf.get(self.pipeline_task, "responses_basename")
        else:
            raise ValueError("""'table_type' parameter must be either features or
            responses.""")

        loaded_dict = json.loads(self.data_dict, object_pairs_hook=OrderedDict)
        table_str = table_basename + \
                    pg_sed.hash_if_needed("".join(loaded_dict.values()))

        # create train data
        filter_condition = "WHERE cv_index <> %s" % self.cur_fold
        train_table = "%s.%s_train_%s" % tuple(
            [models_data_table, table_str, self.cur_fold]
        )

        train_task = LoadData(pipeline_task=self.pipeline_task,
                              table=train_table,
                              table_type=self.table_type,
                              filter_condition=filter_condition,
                              data_dict=self.data_dict)

        # create test data
        filter_condition = "WHERE cv_index = %s" % self.cur_fold
        test_table = "%s.%s_test_%s" % tuple(
            [models_data_table, table_str, self.cur_fold]
        )

        test_task = LoadData(pipeline_task=self.pipeline_task,
                             table=test_table,
                             table_type=self.table_type,
                             filter_condition=filter_condition,
                             data_dict=self.data_dict)
        return [train_task, test_task]
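
With hypothetical values plugged in, the two table names work out as follows; the train table excludes the held-out fold while the test table contains only it:

models_data_table, table_str, cur_fold = "model_data", "features_ab12cd", 0
train_table = "%s.%s_train_%s" % (models_data_table, table_str, cur_fold)
test_table = "%s.%s_test_%s" % (models_data_table, table_str, cur_fold)
# model_data.features_ab12cd_train_0  <- WHERE cv_index <> 0
# model_data.features_ab12cd_test_0   <- WHERE cv_index = 0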
Example #6
def write_responses(responses,
                    schema_name,
                    subset_type,
                    filter_condition=None,
                    responses_dir="./",
                    responses_basename="responses"):
    """
    Wrapper function to get both original and derived features

    This wraps the get_original_features() function, along with any get_*
    function used to get specific derived features specified in the features
    json.

    :param string responses A list of response types to extract. Each type
     of response must correspond to a function in this module.
    :param string subset_type The table in the semantic schema to use in
     generating features. This is specified by the subset_type field in the
     luigi.cfg file, usually.
    :param string filter_condition A condition specifying the subset of rows to
     filter down, within the specified table. This is useful when combined
     with the cv_index column for cross-validation.
    :param responses_dir string The path to which to write the leave-fold-out
     response csv files along with the schema
    :param responses_basename string The basename of the files to which we write
     the leave-fold-out response csv files along with the schema
    :return None
    :side-effects Writes responses leaving out each fold responses_dir, along
     with the schema.
    """
    responses = get_responses(responses, schema_name, subset_type,
                              filter_condition)

    # write responses to file
    responses_str = responses_basename + pg_sed.hash_if_needed(subset_type)
    base_path = pg_sed.process_filter_name(responses_dir, filter_condition,
                                           responses_str)
    responses_path = base_path + ".csv"
    schema_path = base_path + "_schema.csv"

    logger.info("Writing responses to %s", responses_path)
    pg_sed.write_data_with_schema(responses, responses_path, schema_path)
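
pg_sed.write_data_with_schema is not shown; a hypothetical sketch of the pair-of-csvs contract it appears to satisfy, assuming the data arrive as a pandas DataFrame:

import pandas as pd

def write_data_with_schema(data, data_path, schema_path):
    # hypothetical sketch: write values and the column/type schema as
    # separate csvs, so a loader can rebuild the table with its types
    data.to_csv(data_path, index=False, header=False)
    schema = pd.DataFrame({"column": data.columns,
                           "type": data.dtypes.astype(str)})
    schema.to_csv(schema_path, index=False)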
Example #7
    def run(self):
        """
        Extract, preprocess, and write features for this filter condition
        """
        semantic = self.conf.get(self.pipeline_task, "semantic_schema")
        features_basename = self.conf.get(self.pipeline_task,
                                          "features_basename")
        features_dict = json.loads(self.features_dict,
                                   object_pairs_hook=OrderedDict)

        # get the appropriate aggregation level
        grouping_cols = self.conf.get(self.pipeline_task, "grouping_cols")
        grouping_cols = pg_sed.parse_cfg_string(grouping_cols)
        features = ft.get_features(features_dict["features"], semantic,
                                   features_dict["subset_name"],
                                   self.filter_condition, grouping_cols)

        processed_features = ft.preprocess_features(
            features, features_dict["preprocessing"])

        features_str = features_basename + \
                       pg_sed.hash_if_needed("".join(features_dict.values()))
        pg_sed.write_data_with_schema_wrapper(processed_features,
                                              self.features_dir,
                                              self.filter_condition,
                                              features_str)
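
grouping_cols comes out of luigi.cfg as a single string; pg_sed.parse_cfg_string presumably splits it into a list. A minimal sketch under that assumption:

def parse_cfg_string(string):
    # hypothetical sketch: turn a comma-separated luigi.cfg value such
    # as "school_id, grade_level" into ["school_id", "grade_level"]
    return [s.strip() for s in string.split(",") if s.strip()]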
Example #8
    def run(self):
        """
        Run, evaluate, and load a model
        """
        experiments_path = self.conf.get(self.pipeline_task, "experiments")
        with open(experiments_path) as experiments_file:
            experiments_dict = json.load(experiments_file)

        cur_exper = pg_sed.fill_experiment_defaults(
            experiments_dict[self.exper_id], self.conf, self.pipeline_task)

        # fields that uniquely identify a feature matrix
        features_dict = OrderedDict([
            ("subset_name", cur_exper["subset_name"]),
            ("features", cur_exper["features"]),
            ("preprocessing", cur_exper["preprocessing"])
        ])

        features_basename = self.conf.get(self.pipeline_task,
                                          "features_basename")
        features_str = features_basename + \
                       pg_sed.hash_if_needed("".join(features_dict.values()))

        responses_basename = self.conf.get(self.pipeline_task,
                                           "responses_basename")
        responses_str = responses_basename + \
                        pg_sed.hash_if_needed(features_dict["subset_name"])

        basenames = {
            "train_features": "%s_train_%s" % (features_str, self.cur_fold),
            "train_responses": "%s_train_%s" % (responses_str, self.cur_fold),
            "test_features": "%s_test_%s" % (features_str, self.cur_fold),
            "test_responses": "%s_test_%s" % (responses_str, self.cur_fold)
        }

        # get model data
        data = model_funs.get_model_data(
            basenames, self.conf.get(self.pipeline_task, "model_data_schema"))

        model_dict = model_funs.get_model_dict(
            self.theta, self.cur_fold, cur_exper,
            self.conf.get(self.pipeline_task, "responses"), self.seed,
            basenames, self.conf.get(self.pipeline_task, "model_data_schema"))

        # fit the model
        start = time.time()
        model_fit = model_funs.fit_model(cur_exper["model"]["name"],
                                         ast.literal_eval(self.theta),
                                         data["train_features"],
                                         data["train_responses"])
        model_dict["run_date"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(start))
        model_dict["time_to_run"] = time.time() - start

        # save model
        model_dict["binary_path"] = os.path.join(
            self.binaries_path, "%s.pkl" % model_dict["string"])
        with open(model_dict["binary_path"], "wb") as file_obj:
            pickle.dump(model_fit, file_obj)

        # evaluate the model
        metrics_list = pg_sed.parse_cfg_string(cur_exper["metrics"])
        model_eval = model_funs.evaluate_model(model_fit, data, metrics_list)

        # load model
        model_funs.load_model_results(
            model_eval, model_dict, self.models_schema,
            self.conf.get(self.pipeline_task, "models_table"))

        # if successful, touch the log file that output() points at
        output_path = os.path.join(
            self.logging_path, "models",
            "model%s_%s.log" % (self.pipeline_task, model_dict["string"]))
        open(output_path, "a").close()