Example 1
from pyspark.sql import functions as F


def run_builtin_aggregators(builtin_aggregates, df, ctx, spark):
    agg_cols = []
    for r in builtin_aggregates:
        aggregator = ctx.aggregators[r["aggregator"]]
        f_name = extract_spark_name(aggregator["name"])

        agg_func = getattr(F, f_name)  # resolve the Spark SQL function by name
        col_name_list = []
        columns_dict = r["inputs"]["columns"]

        if "col" in columns_dict.keys():
            col_name_list.append(columns_dict["col"])
        if "cols" in columns_dict.keys():
            col_name_list += columns_dict["cols"]
        if "col1" in columns_dict.keys() and "col2" in columns_dict.keys():
            col_name_list.append(columns_dict["col1"])
            col_name_list.append(columns_dict["col2"])

        if len(col_name_list) == 0:
            raise CortexException(
                "input columns not found in aggregator: {}".format(r))

        args = {}
        if r["inputs"].get("args"):  # optional keyword args for the function
            args = ctx.populate_args(r["inputs"]["args"])
        col_list = [F.col(c) for c in col_name_list]
        agg_cols.append(agg_func(*col_list, **args).alias(r["name"]))

    # compute every aggregation in a single pass; agg yields one row
    results = df.agg(*agg_cols).collect()[0].asDict()

    for r in builtin_aggregates:
        ctx.store_aggregate_result(results[r["name"]], r)

    return results
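The key move above is resolving a Spark SQL function purely by name with getattr. A minimal standalone sketch of that technique; the column name "x" and the aggregator name "sum" are made up for illustration:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (2,), (3,)], ["x"])

agg_func = getattr(F, "sum")  # same dynamic lookup as in run_builtin_aggregators
row = df.agg(agg_func(F.col("x")).alias("total")).collect()[0]
print(row["total"])  # 6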
Example 2
    def download_file(self, key, local_path):
        try:
            util.mkdir_p(os.path.dirname(local_path))
            self.s3.download_file(self.bucket, key, local_path)
            return local_path
        except Exception as e:
            raise CortexException("bucket " + self.bucket, "key " + key) from e
Example 3
def download_file_from_s3(key, local_path, bucket, client_config=None):
    try:
        util.mkdir_p(os.path.dirname(local_path))
        s3 = s3_client(client_config or {})
        s3.download_file(bucket, key, local_path)
        return local_path
    except Exception as e:
        raise CortexException("bucket " + bucket, "key " + key) from e
Example 4
    def _read_bytes_from_s3(self, key, allow_missing=False):
        try:
            byte_array = self.s3.get_object(Bucket=self.bucket,
                                            Key=key)["Body"].read()
        except self.s3.exceptions.NoSuchKey as e:
            if allow_missing:
                return None
            raise CortexException("bucket " + self.bucket, "key " + key) from e

        return byte_array.strip()
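The allow_missing branch hinges on boto3's modeled NoSuchKey error. The same pattern with a plain client; bucket and key are placeholders:

import boto3

s3 = boto3.client("s3")
try:
    body = s3.get_object(Bucket="my-bucket", Key="maybe/missing")["Body"].read()
except s3.exceptions.NoSuchKey:
    body = None  # equivalent to allow_missing=True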
Example 5
    def download_file(self, key, local_path):
        util.mkdir_p(os.path.dirname(local_path))
        try:
            self.s3.download_file(self.bucket, key, local_path)
            return local_path
        except Exception as e:
            raise CortexException(
                'key "{}" in bucket "{}" could not be accessed; '.format(
                    key, self.bucket) +
                "it may not exist, or you may not have sufficient permissions"
            ) from e
Example 6
def create_inputs_map(values_map, input_config):
    inputs = {}
    for input_name, input_config_item in input_config.items():
        if util.is_str(input_config_item) or util.is_int(input_config_item):
            # a single reference; string and integer keys are handled the same
            inputs[input_name] = values_map[input_config_item]
        elif util.is_list(input_config_item):
            inputs[input_name] = [values_map[f] for f in input_config_item]
        else:
            raise CortexException("invalid column inputs")

    return inputs
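A quick usage sketch, assuming util.is_str/is_int/is_list behave as their names suggest; the maps are made up:

values_map = {"age": 35, "height": 180, "weight": 80}
input_config = {"num": "age", "nums": ["height", "weight"]}

# create_inputs_map(values_map, input_config)
# -> {"num": 35, "nums": [180, 80]}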
Example 7
    def get_training_data_parts(self, model_name, mode, part_prefix="part"):
        training_dataset = self.models[model_name]["dataset"]
        if mode == "training":
            data_key = training_dataset["train_key"]
        elif mode == "evaluation":
            data_key = training_dataset["eval_key"]
        else:
            raise CortexException(
                'unrecognized mode "{}"; must be one of ("training", "evaluation")'
                .format(mode))

        training_data_parts_prefix = os.path.join(data_key, part_prefix)
        return self.storage.search(prefix=training_data_parts_prefix)
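The prefix built at the end is a plain path join, so the search matches the usual part-file naming; a tiny illustration (key and file names are hypothetical):

import os

prefix = os.path.join("data/training", "part")
print(prefix)  # data/training/part

# storage.search(prefix=prefix) is assumed to return keys starting with the
# prefix, e.g. data/training/part-00000, data/training/part-00001, ...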
Example 8
def create_inputs_from_features_map(features_values_map, feature_input_config):
    inputs = {}
    for input_name, input_config_item in feature_input_config.items():
        if util.is_str(input_config_item) or util.is_int(input_config_item):
            # a single reference; string and integer keys are handled the same
            inputs[input_name] = features_values_map[input_config_item]
        elif util.is_list(input_config_item):
            inputs[input_name] = [
                features_values_map[f] for f in input_config_item
            ]
        else:
            raise CortexException("invalid feature inputs")

    return inputs
Example 9
def _deserialize_raw_ctx(raw_ctx):
    if raw_ctx.get("environment") is not None:
        raw_columns = raw_ctx["raw_columns"]
        raw_ctx["raw_columns"] = util.merge_dicts_overwrite(*raw_columns.values())

        data_split = raw_ctx["environment_data"]

        if data_split["csv_data"] is not None and data_split["parquet_data"] is None:
            raw_ctx["environment"]["data"] = data_split["csv_data"]
        elif data_split["parquet_data"] is not None and data_split["csv_data"] is None:
            raw_ctx["environment"]["data"] = data_split["parquet_data"]
        else:
            raise CortexException("expected csv_data or parquet_data but found " + data_split)

    return raw_ctx
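util.merge_dicts_overwrite is assumed to be a left-to-right merge in which later values win (consistent with how Example 10 merges the three raw-column dicts); a minimal equivalent:

def merge_dicts_overwrite(*dicts):
    merged = {}
    for d in dicts:
        merged.update(d)  # later dicts overwrite earlier keys
    return merged

print(merge_dicts_overwrite({"a": 1}, {"a": 2, "b": 3}))  # {'a': 2, 'b': 3}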
Example 10
def _deserialize_raw_ctx(raw_ctx):
    raw_columns = raw_ctx["raw_columns"]
    raw_ctx["raw_columns"] = util.merge_dicts_overwrite(
        raw_columns["raw_int_columns"],
        raw_columns["raw_float_columns"],
        raw_columns["raw_string_columns"],
    )

    data_split = raw_ctx["environment_data"]

    if data_split["csv_data"] is not None and data_split["parquet_data"] is None:
        raw_ctx["environment"]["data"] = data_split["csv_data"]
    elif data_split["parquet_data"] is not None and data_split["csv_data"] is None:
        raw_ctx["environment"]["data"] = data_split["parquet_data"]
    else:
        raise CortexException("expected csv_data or parquet_data but found " + data_split)
    return raw_ctx
Example 11
    def _read_bytes_from_s3(self, key, allow_missing=False, ext_bucket=None):
        bucket = self.bucket
        if ext_bucket is not None:
            bucket = ext_bucket

        try:
            try:
                byte_array = self.s3.get_object(Bucket=bucket,
                                                Key=key)["Body"].read()
            except self.s3.exceptions.NoSuchKey as e:
                if allow_missing:
                    return None
                raise e
        except Exception as e:
            raise CortexException(
                'key "{}" in bucket "{}" could not be accessed; '.format(
                    key, bucket) +
                "it may not exist, or you may not have suffienct permissions"
            ) from e

        return byte_array.strip()
Example 12
    def download_file(self, key, local_path):
        try:
            Path(local_path).parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(str(self._get_path(key)), local_path)
        except Exception as e:
            raise CortexException("file not found", key) from e