Code example #1
File: unfold.py Project: usc-isi-i2/dsbox-cleaning
    def _split_aggregated(self, df: container.DataFrame,
                          split_col_names: list) -> container.DataFrame:
        # Number of aggregated values in each list-valued column, taken from row 0.
        lengths = [len(df.loc[0, col_name]) for col_name in split_col_names]

        for idx, col_name in enumerate(split_col_names):
            if self._sorted_pipe_ids:
                if len(self._sorted_pipe_ids) == lengths[idx]:
                    extend_col_names = [
                        "{}_{}".format(col_name, i)
                        for i in self._sorted_pipe_ids
                    ]
                else:
                    raise ValueError(
                        "Unique number of pipeline ids not equal to the number of aggregated values"
                    )
            else:
                extend_col_names = [
                    "{}_{}".format(col_name, i) for i in range(lengths[idx])
                ]

            # Expand the list-valued column into one new column per element.
            extends = container.DataFrame(df.loc[:, col_name].values.tolist(),
                                          columns=extend_col_names)

            df = common_utils.horizontal_concat(left=df, right=extends)
            origin_metadata = dict(
                df.metadata.query(
                    (mbase.ALL_ELEMENTS, df.columns.get_loc(col_name))))

            for name in extend_col_names:
                col_idx = df.columns.get_loc(name)
                origin_metadata["name"] = name
                df.metadata = df.metadata.update((mbase.ALL_ELEMENTS, col_idx),
                                                 origin_metadata)

        return df
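
For context, a minimal pandas-only sketch of the expansion this method performs (D3M metadata handling omitted; the frame and column names below are invented for illustration):

import pandas as pd

# Hypothetical input: an aggregated column whose cells hold lists.
df = pd.DataFrame({"id": [1, 2], "score": [[0.1, 0.2], [0.3, 0.4]]})

# Mirror the range-based extend_col_names branch: one new column per
# list element, named "<col>_<i>".
extends = pd.DataFrame(df["score"].values.tolist(),
                       columns=["score_0", "score_1"])
df = pd.concat([df, extends], axis=1)
#    id       score  score_0  score_1
# 0   1  [0.1, 0.2]      0.1      0.2
# 1   2  [0.3, 0.4]      0.3      0.4
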
Code example #2
File: unfold.py Project: usc-isi-i2/dsbox-cleaning
    @staticmethod
    def _update_metadata_dimension(
            df: container.DataFrame) -> container.DataFrame:
        # Sync the table-level "dimension" metadata with the actual row count.
        old_metadata = dict(df.metadata.query(()))
        old_metadata["dimension"] = dict(old_metadata["dimension"])
        old_metadata["dimension"]["length"] = df.shape[0]
        df.metadata = df.metadata.update((), old_metadata)
        return df
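
The copy-before-mutate pattern exists because `df.metadata.query(())` returns an immutable mapping; a standalone sketch of just the dict handling (the metadata values below are made up):

# Hypothetical top-level metadata as returned by df.metadata.query(())
old_metadata = {"dimension": {"name": "rows", "length": 100}}

# The nested "dimension" entry is also immutable in D3M, so it is
# re-wrapped in a plain dict before the row count is overwritten.
old_metadata["dimension"] = dict(old_metadata["dimension"])
old_metadata["dimension"]["length"] = 42  # e.g. df.shape[0] after filtering
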
Code example #3
    def _update_type_info(self, semantic_types: Sequence[str],
                          outputs: container.DataFrame,
                          i: int) -> container.DataFrame:
        # update the structural / df type from the semantic type
        if "http://schema.org/Integer" in semantic_types:
            outputs.metadata = outputs.metadata.update_column(
                i, {"structural_type": int})
            outputs.iloc[:, i] = pd.to_numeric(outputs.iloc[:, i])
        elif "http://schema.org/Float" in semantic_types:
            outputs.metadata = outputs.metadata.update_column(
                i, {"structural_type": float})
            outputs.iloc[:, i] = pd.to_numeric(outputs.iloc[:, i])
        elif "http://schema.org/Boolean" in semantic_types:
            outputs.metadata = outputs.metadata.update_column(
                i, {"structural_type": bool})
            outputs.iloc[:, i] = outputs.iloc[:, i].astype("bool")

        return outputs
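
A standalone illustration of the conversions each branch applies, using plain pandas (the series contents are invented; the D3M metadata updates are omitted):

import pandas as pd

# "http://schema.org/Integer" / "http://schema.org/Float" branches
numeric = pd.to_numeric(pd.Series(["1", "2", "3"]))   # dtype becomes int64
floats = pd.to_numeric(pd.Series(["1.5", "2.5"]))     # dtype becomes float64

# "http://schema.org/Boolean" branch
flags = pd.Series([0, 1, 1]).astype("bool")
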
Code example #4
    def produce(
        self,
        *,
        inputs: container.DataFrame,
        timeout: float = None,
        iterations: int = None,
    ) -> base.CallResult[container.DataFrame]:

        start = time.time()
        logger.debug(f"Producing {__name__}")

        cols = self._get_columns(inputs.metadata)
        # outputs = container.DataFrame(generate_metadata=False)
        outputs = [None] * inputs.shape[1]

        parsing_semantics = self.hyperparams["parsing_semantics"]

        def fromstring(x: str) -> np.ndarray:
            # if column isn't a string, we'll just pass it through assuming it doesn't need to be parsed
            if type(x) is not str:
                return x

            return np.fromstring(x, dtype=float, sep=",")

        for col_index in range(len(inputs.columns)):
            if col_index in cols:
                column_metadata = inputs.metadata.query(
                    (metadata_base.ALL_ELEMENTS, col_index)
                )
                semantic_types = column_metadata.get("semantic_types", [])
                desired_semantics = set(semantic_types).intersection(parsing_semantics)
                if desired_semantics:
                    if (
                        "https://metadata.datadrivendiscovery.org/types/FloatVector"
                        in desired_semantics
                    ):
                        outputs[col_index] = inputs.iloc[:, col_index].apply(
                            fromstring, convert_dtype=False
                        )
                        if outputs[col_index].shape[0] > 0:
                            inputs.metadata = inputs.metadata.update_column(
                                col_index,
                                {"structural_type": type(outputs[col_index][0])},
                            )
                    elif "http://schema.org/DateTime" in desired_semantics:
                        outputs[col_index] = inputs.iloc[:, col_index].apply(
                            utils.parse_datetime_to_float,
                            fuzzy=self.hyperparams["fuzzy_time_parsing"],
                            convert_dtype=False,
                        )
                        inputs.metadata = inputs.metadata.update_column(
                            col_index, {"structural_type": float}
                        )
                    elif (
                        "https://metadata.datadrivendiscovery.org/types/CategoricalData"
                        in desired_semantics
                    ):
                        # need to make sure if a categorical type is a numeric string, convert it
                        if inputs[inputs.columns[col_index]][0].isnumeric():
                            outputs[col_index] = pd.to_numeric(
                                inputs.iloc[:, col_index],
                                errors=self.hyperparams["error_handling"],
                            )
                            if outputs[col_index].shape[0] > 0:
                                updated_type = type(outputs[col_index][0].item())
                                inputs.metadata = inputs.metadata.update_column(
                                    col_index, {"structural_type": updated_type}
                                )
                        else:
                            # if it's categorical but not numerical, ensure the string stays
                            outputs[col_index] = inputs.iloc[:, col_index]
                    else:
                        outputs[col_index] = pd.to_numeric(
                            inputs.iloc[:, col_index],
                            errors=self.hyperparams["error_handling"],
                        )
                        # Update structural type to reflect the results of the to_numeric call. We can't rely on the semantic type because
                        # error coercion may result in a type becoming a float due to the presence of NaN.
                        if outputs[col_index].shape[0] > 0:
                            updated_type = type(outputs[col_index][0].item())
                            inputs.metadata = inputs.metadata.update_column(
                                col_index, {"structural_type": updated_type}
                            )
                else:
                    # columns without specified semantics need to be concatenated
                    outputs[col_index] = inputs.iloc[:, col_index]
            else:
                # columns not specified still need to be concatenated
                outputs[col_index] = inputs.iloc[:, col_index]

        outputs = container.DataFrame(pd.concat(outputs, axis=1))
        outputs.metadata = inputs.metadata
        end = time.time()
        logger.debug(f"Produce {__name__} completed in {end - start} ms")

        return base.CallResult(outputs)
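
The `fromstring` helper above is the piece doing the FloatVector work; isolated, its behavior looks like this (a minimal sketch, not part of the primitive):

import numpy as np

def fromstring(x):
    # Non-strings pass through untouched, matching the helper above.
    if type(x) is not str:
        return x
    return np.fromstring(x, dtype=float, sep=",")

print(fromstring("1.5,2.0,3.25"))  # [1.5  2.   3.25]
print(fromstring(7))               # 7 (passed through unparsed)
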
Code example #5
    def produce(
            self,
            *,
            inputs: container.DataFrame,
            timeout: float = None,
            iterations: int = None) -> base.CallResult[container.DataFrame]:

        cols = ["idx", "name", "rank"]

        # Make sure the target column is of a valid type and return no ranked features if it isn't.
        target_idx = self.hyperparams["target_col_index"]
        if not self._can_use_column(inputs.metadata, target_idx):
            return base.CallResult(container.DataFrame(data={}, columns=cols))

        # check if target is discrete or continuous
        semantic_types = inputs.metadata.query_column(
            target_idx)["semantic_types"]
        discrete = len(set(semantic_types).intersection(
            self._discrete_types)) > 0

        # make a copy of the inputs and clean out any missing data
        feature_df = inputs.copy()
        if self.hyperparams["sub_sample"]:
            sub_sample_size = (self.hyperparams["sub_sample_size"]
                               if self.hyperparams["sub_sample_size"] <
                               inputs.shape[0] else inputs.shape[0])
            rows = random.sample_without_replacement(inputs.shape[0],
                                                     sub_sample_size)
            feature_df = feature_df.iloc[rows, :]
        # makes sure that if an entire column is NA, we remove that column, so as to not remove ALL rows
        cols_to_drop = feature_df.columns[feature_df.isna().sum() ==
                                          feature_df.shape[0]]
        feature_df.drop(columns=cols_to_drop, inplace=True)
        feature_df.dropna(inplace=True)

        # split out the target feature
        target_df = feature_df.iloc[:,
                                    feature_df.columns.
                                    get_loc(inputs.columns[target_idx])]

        # drop features that are not compatible with ranking
        feature_indices = set(
            inputs.metadata.list_columns_with_semantic_types(
                self._semantic_types))
        role_indices = set(
            inputs.metadata.list_columns_with_semantic_types(self._roles))
        feature_indices = feature_indices.intersection(role_indices)
        feature_indices.remove(target_idx)
        for categ_ind in inputs.metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/CategoricalData",
             )):
            if categ_ind in feature_indices:
                if (np.unique(inputs[inputs.columns[categ_ind]]).shape[0] ==
                        inputs.shape[0]):
                    feature_indices.remove(categ_ind)
                elif (inputs.metadata.query(
                    (metadata_base.ALL_ELEMENTS,
                     categ_ind))["structural_type"] == str):
                    feature_df[inputs.columns[categ_ind]] = pd.to_numeric(
                        feature_df[inputs.columns[categ_ind]])
        text_indices = inputs.metadata.list_columns_with_semantic_types(
            self._text_semantic)

        tfv = TfidfVectorizer(max_features=20)
        column_to_text_features = {}
        text_feature_indices = []
        for text_index in text_indices:
            if (text_index not in feature_indices
                    and text_index in role_indices
                    and text_index != target_idx):
                word_features = tfv.fit_transform(
                    feature_df[inputs.columns[text_index]])
                if issparse(word_features):
                    column_to_text_features[inputs.columns[
                        text_index]] = pd.DataFrame.sparse.from_spmatrix(
                            word_features)
                else:
                    column_to_text_features[
                        inputs.columns[text_index]] = word_features
                text_feature_indices.append(text_index)
        text_feature_indices = set(text_feature_indices)

        # return an empty result if all features were incompatible
        numeric_features = len(feature_indices) > 0
        if not numeric_features and len(column_to_text_features) == 0:
            return base.CallResult(container.DataFrame(data={}, columns=cols))

        all_indices = set(range(0, inputs.shape[1]))
        skipped_indices = all_indices.difference(
            feature_indices.union(text_feature_indices))
        # remove columns that were dropped
        feature_indices = feature_indices - set(
            [inputs.columns.get_loc(c) for c in cols_to_drop])
        for v in skipped_indices:
            feature_df.drop(inputs.columns[v], axis=1, inplace=True)

        # figure out the discrete and continuous feature indices and create an array
        # that flags them
        feature_columns = inputs.columns[list(feature_indices)]
        numeric_data = feature_df[feature_columns]
        discrete_indices = inputs.metadata.list_columns_with_semantic_types(
            self._discrete_types)
        discrete_flags = [False] * numeric_data.shape[1]
        for v in discrete_indices:
            col_name = inputs.columns[v]
            if col_name in numeric_data:
                # only mark columns with at least 1 duplicate value as discrete when predicting
                # a continuous target - there's a check in the bowels of MI code that will throw
                # an exception otherwise
                if numeric_data[col_name].duplicated().any() and not discrete:
                    col_idx = numeric_data.columns.get_loc(col_name)
                    discrete_flags[col_idx] = True

        target_np = target_df.values

        # compute mutual information for discrete or continuous target
        ranked_features_np = np.empty([0])
        text_ranked_features_np = np.empty((len(column_to_text_features), ))
        if discrete:
            if numeric_features:
                ranked_features_np = mutual_info_classif(
                    numeric_data.values,
                    target_np,
                    discrete_features=discrete_flags,
                    n_neighbors=self.hyperparams["k"],
                    random_state=self._random_seed,
                )
            for i, column in enumerate(column_to_text_features):
                text_rankings = mutual_info_classif(
                    column_to_text_features[column],
                    target_np,
                    discrete_features=[False] *
                    column_to_text_features[column].shape[1],
                    n_neighbors=self.hyperparams["k"],
                    random_state=self._random_seed,
                )
                sum_text_rank = np.sum(text_rankings)
                text_ranked_features_np[i] = sum_text_rank
        else:
            if numeric_features:
                ranked_features_np = mutual_info_regression(
                    numeric_data.values,
                    target_np,
                    discrete_features=discrete_flags,
                    n_neighbors=self.hyperparams["k"],
                    random_state=self._random_seed,
                )
            for i, column in enumerate(column_to_text_features):
                text_rankings = mutual_info_regression(
                    column_to_text_features[column],
                    target_np,
                    discrete_features=[False] *
                    column_to_text_features[column].shape[1],
                    n_neighbors=self.hyperparams["k"],
                    random_state=self._random_seed,
                )
                sum_text_rank = np.sum(text_rankings)
                text_ranked_features_np[i] = sum_text_rank

        ranked_features_np, target_entropy = self._normalize(
            ranked_features_np,
            feature_df[feature_columns],
            target_np,
            discrete,
            discrete_flags,
        )
        text_ranked_features_np = self._normalize_text(
            text_ranked_features_np, column_to_text_features, target_entropy)

        if self.hyperparams["return_as_metadata"]:
            ranked_features_np = np.append(ranked_features_np,
                                           text_ranked_features_np)
            for i, f in enumerate(feature_indices.union(text_feature_indices)):
                column_metadata = inputs.metadata.query(
                    (metadata_base.ALL_ELEMENTS, f))
                rank_dict = dict(column_metadata)
                rank_dict["rank"] = ranked_features_np[i]
                inputs.metadata = inputs.metadata.update(
                    (metadata_base.ALL_ELEMENTS, f),
                    FrozenOrderedDict(rank_dict.items()),
                )
            return base.CallResult(inputs)

        # merge back into a single list of col idx / rank value tuples
        data: typing.List[typing.Tuple[int, str, float]] = []
        data = self._append_rank_info(inputs, data, ranked_features_np,
                                      feature_df[feature_columns])
        data = self._append_rank_info(
            inputs,
            data,
            text_ranked_features_np,
            feature_df[inputs.columns[list(text_feature_indices)]],
        )

        # wrap as a D3M container - metadata should be auto generated
        results = container.DataFrame(data=data,
                                      columns=cols,
                                      generate_metadata=True)
        results = results.sort_values(by=["rank"],
                                      ascending=False).reset_index(drop=True)
        return base.CallResult(results)
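
At its core the ranking reduces to scikit-learn's mutual information estimators; a minimal sketch with synthetic data standing in for `feature_df` and `target_df` (the array shapes and the n_neighbors value are illustrative only):

import numpy as np
from sklearn.feature_selection import mutual_info_classif

rng = np.random.RandomState(0)
X = rng.rand(100, 3)                      # stand-in for numeric_data.values
y = (X[:, 0] > 0.5).astype(int)           # discrete target, so the
                                          # mutual_info_classif branch applies

scores = mutual_info_classif(
    X, y,
    discrete_features=[False, False, False],
    n_neighbors=3,                        # plays the role of hyperparams["k"]
    random_state=0,
)
order = np.argsort(scores)[::-1]          # highest mutual information first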