Example 1
    def run(self, *args, **kwargs):
        """ Creates a pandas dataframe from the csv source """
        try:
            url = self.get_attribute("source.url")
            if not url:
                raise PluginError("URL of csv file cannot be empty.", plugin=self)

            # the source schema, if any, is declared as part of the source definition
            schema = self.get_attribute("source.schema")

            # no schema was provided but the url is that of an analitico dataset in the cloud
            if not schema and url.startswith("analitico://") and url.endswith("/data/csv"):
                info_url = url.replace("/data/csv", "/data/info")
                info = self.factory.get_url_json(info_url)
                schema = get_dict_dot(info, "data.schema")

            # array of types for each column in the source
            columns = schema.get("columns") if schema else None

            dtype = None
            if columns:
                dtype = {}
                for column in columns:
                    if "type" in column:  # type is optionally defined
                        if column["type"] == "datetime":
                            dtype[column["name"]] = "object"
                        elif column["type"] == "timespan":
                            dtype[column["name"]] = "object"
                        else:
                            dtype[column["name"]] = analitico_to_pandas_type(column["type"])

            stream = self.factory.get_url_stream(url, binary=False)
            df = pandas.read_csv(stream, dtype=dtype, encoding="utf-8", na_values=NA_VALUES)

            tail = self.get_attribute("tail", 0)
            if tail > 0:
                rows_before = len(df)
                df = df.tail(tail)
                self.info("tail: %d, rows before: %d, rows after: %d", tail, rows_before, len(df))

            if schema:
                # reorder, filter, apply types, rename columns as requested in schema
                df = apply_schema(df, schema)

            return df
        except Exception as exc:
            self.exception("Error while processing: %s", url, exception=exc)
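Note that in the example above both "datetime" and "timespan" columns are read as plain objects so that apply_schema can parse them afterwards, while every other type is mapped through analitico_to_pandas_type. A minimal sketch of what such a mapping could look like (an assumption for illustration; the real analitico_to_pandas_type may support more types):

    # hypothetical mapping from analitico column types to pandas dtypes
    ANALITICO_TO_PANDAS_TYPES = {
        "string": "str",
        "integer": "int64",
        "float": "float64",
        "boolean": "bool",
        "category": "category",
    }

    def analitico_to_pandas_type(analitico_type: str) -> str:
        """ Returns the pandas dtype for the given analitico type, eg: integer -> int64 """
        try:
            return ANALITICO_TO_PANDAS_TYPES[analitico_type]
        except KeyError:
            raise KeyError("analitico_to_pandas_type - unknown type: " + analitico_type)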
Example 2
    def test_dataset_csv4_applyschema_rename(self):
        """ Test reading a table then renaming a column """
        df = self.read_dataframe_asset("ds_test_4.json")
        schema = generate_schema(df)

        columns = schema["columns"]
        self.assertEqual(len(columns), 3)
        self.assertEqual(df.columns[1], "Second")

        # ask for the second column to be renamed when the schema is applied
        schema["columns"][1]["rename"] = "Secondo"
        df = apply_schema(df, schema)

        self.assertEqual(df.columns[1], "Secondo")
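This test relies on apply_schema honoring the optional "rename" key in a column definition. A minimal sketch of how that rename pass could be implemented with pandas (an illustration based on the schema layout shown above, not the actual apply_schema, which also reorders, filters and retypes columns):

    import pandas as pd

    def apply_schema_renames(df: pd.DataFrame, schema: dict) -> pd.DataFrame:
        """ Renames dataframe columns whose schema definition carries a "rename" key """
        renames = {c["name"]: c["rename"] for c in schema.get("columns", []) if "rename" in c}
        return df.rename(columns=renames) if renames else df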
Example 3
    def test_dataset_csv4_applyschema_index(self):
        """ Test reading a table then making a column its index """
        df = self.read_dataframe_asset("ds_test_4.json")
        schema = generate_schema(df)

        columns = schema["columns"]
        self.assertEqual(len(columns), 3)
        self.assertIsNone(df.index.name)

        # flag the first column as the index to be applied
        schema["columns"][0]["index"] = True
        df = apply_schema(df, schema)

        self.assertEqual(df.index.name, "First")
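Similarly, the "index" flag asks apply_schema to promote a column to the dataframe index. A possible sketch with pandas (again an assumption about the internals; whether the promoted column is also kept as a regular column is not shown by the test):

    import pandas as pd

    def apply_schema_index(df: pd.DataFrame, schema: dict) -> pd.DataFrame:
        """ Sets the first column flagged with "index": True as the dataframe index """
        for column in schema.get("columns", []):
            if column.get("index"):
                return df.set_index(column["name"], drop=False)
        return df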
Example 4
    def run(self, *args, action=None, **kwargs) -> pd.DataFrame:
        """ Applies the 'schema' attribute to transform the input dataframe """
        df = args[0]
        if not isinstance(df, pd.DataFrame):
            self.warning("TransformDataframePlugin - requires a single pd.DataFrame as input, none was found")
            return args

        schema = self.get_attribute("schema", None)
        if not schema:
            self.warning("TransformDataframePlugin - requires a 'schema' attribute with the schema to apply to the input dataframe")
            return df

        # reorder, filter, retype and rename columns as requested by the schema
        df = apply_schema(df, schema)
        self.info("TransformDataframePlugin - schema applied to dataframe")

        return df
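For context, the 'schema' attribute this plugin expects is a dictionary with a "columns" list like the one produced by generate_schema in the tests above. A hypothetical configuration combining the features seen so far:

    # hypothetical schema attribute for TransformDataframePlugin
    schema = {
        "columns": [
            {"name": "First", "type": "integer", "index": True},
            {"name": "Second", "type": "string", "rename": "Secondo"},
        ]
    }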
Example 5
    def _run_predict(self, *args, **kwargs):
        """ 
        When an algorithm runs it always takes in a dataframe with training data,
        it may optionally have a dataframe of validation data and will return a dictionary
        with information on the trained model plus a number of artifacts.
        """
        # assert isinstance(args[0], pandas.DataFrame) # custom models may take json as input
        data = args[0]

        artifacts_path = self.factory.get_artifacts_directory()
        training = read_json(os.path.join(artifacts_path, "metadata.json"))
        assert training

        started_on = time_ms()
        results = collections.OrderedDict({
            "type": "analitico/prediction",
            # "endpoint_id": None,
            # "model_id": None,
            # "job_id": None,
            # "records": None,  # processed (augmented) data will be added by IAlgorithm
            # "predictions": None,  # predictions
            # "probabilities": None,
            "performance": get_runtime_brief(),  # time elapsed, cpu, gpu, memory, disk, etc
        })

        # force the same schema that was used on the training data
        if isinstance(data, pd.DataFrame):
            schema = training["data"]["schema"]
            data = apply_schema(data, schema)

        # load model, calculate predictions
        results = self.predict(data, training, results, *args, **kwargs)
        results["performance"]["total_ms"] = time_ms(started_on)

        results_path = os.path.join(artifacts_path, "results.json")
        save_json(results, results_path)

        return results
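The snippet uses time_ms both to take a starting timestamp (time_ms()) and to compute the elapsed milliseconds (time_ms(started_on)). A minimal sketch of a helper with that dual behavior (an assumption; analitico's actual time_ms may differ):

    import time

    def time_ms(started_on=None):
        """ Current timestamp, or milliseconds elapsed since the given timestamp """
        now = time.time()
        if started_on is None:
            return now
        return int((now - started_on) * 1000)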