Exemple #1
0
    def _join_string_col(cls, left_df: container.DataFrame, left_col: str,
                         right_df: container.DataFrame, right_col: str,
                         accuracy: float) -> pd.DataFrame:
        """Inner-join two frames on fuzzily matched string columns.

        Each distinct value of ``left_col`` is resolved once through
        ``cls._string_fuzzy_match`` against the distinct values of
        ``right_col``; the resolved value becomes the left join key and is
        joined against ``right_col`` used as the right-hand index.

        Args:
            left_df: Left frame. It is copied, so the caller's object keeps
                its original index.
            left_col: Name of the join column in ``left_df``.
            right_df: Right frame. Its ``d3mIndex`` column, when present, is
                discarded so the left one is the only one in the result.
            right_col: Name of the join column in ``right_df``. It becomes the
                right-hand index and does not appear in the result.
            accuracy: Match threshold; passed to the matcher multiplied by 100
                (presumably a 0-1 fraction - confirm against the matcher).

        Returns:
            The joined rows, sorted by ``d3mIndex`` when that column exists
            and by ``left_col`` otherwise, with a fresh positional index.
            Column names present on both sides get ``_1`` / ``_2`` suffixes.
        """
        # The right d3mIndex is optional: ignore its absence instead of raising KeyError.
        right_df = right_df.drop(columns='d3mIndex', errors='ignore')

        # The index is replaced below; do that on a copy so the caller's frame is untouched.
        left_df = left_df.copy()

        # Pre-compute the fuzzy match once per distinct left value.
        left_keys = left_df[left_col].unique()
        right_keys = right_df[right_col].unique()
        matches: typing.Dict[str, typing.Optional[str]] = {
            left_key: cls._string_fuzzy_match(left_key, right_keys, accuracy * 100)
            for left_key in left_keys
        }

        # Index the left rows by their matched right-hand key.
        # NOTE(review): the annotation allows a None match; such rows only
        # survive the inner join if the right key column also holds nulls.
        left_df.index = left_df[left_col].map(lambda key: matches[key])

        # Index the right rows by their join column so both sides join on index.
        right_df = right_df.set_index(right_col)

        joined = container.DataFrame(
            left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

        # Prefer the d3m index for ordering; fall back to the join column.
        sort_col = 'd3mIndex' if 'd3mIndex' in joined else left_col
        joined = joined.sort_values(by=[sort_col])
        joined = joined.reset_index(drop=True)

        return joined
Exemple #2
0
    def _join_numeric_col(cls, left_df: container.DataFrame, left_col: str,
                          right_df: container.DataFrame, right_col: str,
                          accuracy: float) -> pd.DataFrame:
        """Inner-join two frames on fuzzily matched numeric columns.

        Both join columns are converted with ``pd.to_numeric``. Every left
        value is resolved through ``cls._numeric_fuzzy_match`` against the
        distinct right values, and the resolved value is used as the join key.

        Args:
            left_df: Left frame. It is copied, so the numeric conversion and
                the re-indexing do not touch the caller's object.
            left_col: Name of the join column in ``left_df``.
            right_df: Right frame. Its ``d3mIndex`` column, when present, is
                discarded so the left one is the only one in the result.
            right_col: Name of the join column in ``right_df``. It becomes the
                right-hand index and does not appear in the result.
            accuracy: Match threshold, passed unchanged to the matcher.

        Returns:
            The joined rows, sorted by ``d3mIndex`` when that column exists
            and by ``left_col`` otherwise, with a fresh positional index.
            ``left_col`` holds the numeric-converted values. Column names
            present on both sides get ``_1`` / ``_2`` suffixes.
        """
        # The right d3mIndex is optional: ignore its absence instead of raising KeyError.
        # drop() returns a new frame, so the column assignment below is local.
        right_df = right_df.drop(columns='d3mIndex', errors='ignore')

        # The join column and index are overwritten below; keep that off the caller's frame.
        left_df = left_df.copy()

        # Compare numbers with numbers on both sides.
        right_df[right_col] = pd.to_numeric(right_df[right_col])
        choices = right_df[right_col].unique()
        left_df[left_col] = pd.to_numeric(left_df[left_col])

        # Index the left rows by the right-hand value each one was matched to.
        left_df.index = left_df[left_col].map(
            lambda value: cls._numeric_fuzzy_match(value, choices, accuracy))

        # Index the right rows by their join column so both sides join on index.
        right_df = right_df.set_index(right_col)

        joined = container.DataFrame(
            left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

        # Prefer the d3m index for ordering; fall back to the join column.
        sort_col = 'd3mIndex' if 'd3mIndex' in joined else left_col
        joined = joined.sort_values(by=[sort_col])
        joined = joined.reset_index(drop=True)

        return joined
Exemple #3
0
    def _join_datetime_col(cls,
                           left_df: container.DataFrame,
                           left_col: str,
                           right_df: container.DataFrame,
                           right_col: str,
                           accuracy: float) -> pd.DataFrame:
        """Inner-join two frames on fuzzily matched date/time columns.

        Both join columns are parsed with ``parser.parse`` into
        ``np.datetime64`` values. Each left value is resolved through
        ``cls._datetime_fuzzy_match`` against the distinct right values, using
        a tolerance of ``(1 - accuracy)`` times the range reported by
        ``cls._compute_time_range``.

        Args:
            left_df: Left frame. It is copied, so the caller's object keeps
                its original index.
            left_col: Name of the date/time string column in ``left_df``.
            right_df: Right frame. Its ``d3mIndex`` column, when present, is
                discarded so the left one is the only one in the result.
            right_col: Name of the date/time string column in ``right_df``.
                It is used as the join key and does not appear in the result.
            accuracy: Scales the tolerance as ``1.0 - accuracy``
                (presumably a 0-1 fraction - confirm against callers).

        Returns:
            The joined rows, sorted by ``d3mIndex`` when that column exists
            and by ``left_col`` otherwise, with a fresh positional index.
            Column names present on both sides get ``_1`` / ``_2`` suffixes.
        """
        # The right d3mIndex is optional: ignore its absence instead of raising KeyError.
        right_df = right_df.drop(columns='d3mIndex', errors='ignore')

        # The index is replaced below; do that on a copy so the caller's frame is untouched.
        left_df = left_df.copy()

        # Parse every distinct right value once; the mapping is reused below to
        # build the right-hand index from exactly the same parsed values.
        parsed_right = {
            raw: np.datetime64(parser.parse(raw))
            for raw in right_df[right_col].unique()
        }
        choices = np.array(list(parsed_right.values()))
        left_keys = np.array([np.datetime64(parser.parse(dt)) for dt in left_df[left_col].values])

        # Tolerance is a fraction of the time range computed over both sides.
        time_tolerance = (1.0 - accuracy) * cls._compute_time_range(left_keys, choices)

        # Index the left rows by the parsed right-hand value each one was matched to.
        left_df.index = np.array([cls._datetime_fuzzy_match(dt, choices, time_tolerance) for dt in left_keys])

        # Index the right rows by the *parsed* value of their join column. The
        # matcher only ever sees parsed datetimes, so joining against the raw
        # strings would depend on pandas coercing the string index implicitly.
        # NOTE(review): that coercion appears to be pandas-version dependent - confirm.
        right_index = np.array([parsed_right[raw] for raw in right_df[right_col].values])
        right_df = right_df.drop(columns=right_col)
        right_df.index = right_index

        joined = container.DataFrame(left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

        # Prefer the d3m index for ordering; fall back to the join column.
        sort_col = 'd3mIndex' if 'd3mIndex' in joined else left_col
        joined = joined.sort_values(by=[sort_col])
        joined = joined.reset_index(drop=True)

        return joined
Exemple #4
0
    def _produce(
        self,
        *,
        left_df_full: container.DataFrame, # type: ignore
        left_df: container.DataFrame,  # type: ignore
        right_df: container.DataFrame,  # type: ignore
        join_types: typing.Sequence[str],
        left_col: typing.Sequence[int],
        right_col: typing.Sequence[int],
        accuracy: typing.Sequence[float],
        absolute_accuracy: typing.Sequence[bool]
    ) -> base.CallResult[Outputs]:
        """Merge two frames on one or more fuzzily matched column pairs.

        For every join position a type-specific helper builds temporary merge
        columns. Both frames are then merged on those columns with
        ``pd.merge`` and the temporary columns are removed from the result.

        The work is done on copies of ``left_df`` and ``right_df``, so the
        frames passed in by the caller are never modified, even when an
        unsupported join type raises part-way through.

        Args:
            left_df_full: Frame used only to compute the datetime tolerance.
            left_df: Left frame to merge.
            right_df: Right frame to merge. Its ``d3mIndex`` column, when
                present, is dropped before merging.
            join_types: Per join position, the value tested against the
                ``_*_JOIN_TYPES`` attributes with ``.intersection``.
                NOTE(review): annotated as a sequence of ``str``, but
                ``.intersection`` iterates its argument, so each entry is
                presumably a collection of type names - confirm.
            left_col: Per join position, the left join column.
            right_col: Per join position, the right join column.
            accuracy: Per join position, the accuracy handed to the helper.
            absolute_accuracy: Per join position, the flag handed to the
                numeric, geo and vector helpers; not read for string or
                datetime joins.

        Returns:
            The merged frame. Clashing column names get ``_left`` /
            ``_right`` suffixes.
            NOTE(review): the annotation names ``base.CallResult``, but the
            merged frame itself is returned - presumably the caller wraps it.

        Raises:
            exceptions.InvalidArgumentValueError: A join position matches
                none of the supported type families.
        """
        # Columns are added, renamed and dropped below; keep that off the caller's frames.
        left_df = left_df.copy()
        right_df = right_df.copy()

        right_cols_to_drop = []
        new_left_cols = []
        new_right_cols = []
        for col_index, left_key in enumerate(left_col):
            col_types = join_types[col_index]

            # A branch either renames the existing right column (string and
            # numeric joins) or produces new right-hand merge columns (geo,
            # vector and datetime joins); exactly one of these two is set.
            renamed_right = None
            new_right_df = None

            if len(self._STRING_JOIN_TYPES.intersection(col_types)) > 0:
                new_left_df = self._create_string_merge_cols(
                    left_df,
                    left_key,
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                    col_index,
                )
                renamed_right = "righty_string" + str(col_index)
            elif len(self._NUMERIC_JOIN_TYPES.intersection(col_types)) > 0:
                new_left_df = self._create_numeric_merge_cols(
                    left_df,
                    left_key,
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                    col_index,
                    absolute_accuracy[col_index],
                )
                renamed_right = "righty_numeric" + str(col_index)
            elif len(self._GEO_JOIN_TYPES.intersection(col_types)) > 0:
                new_left_df, new_right_df = self._create_geo_vector_merging_cols(
                    left_df,
                    left_key,
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                    col_index,
                    absolute_accuracy[col_index],
                )
            elif len(self._VECTOR_JOIN_TYPES.intersection(col_types)) > 0:
                new_left_df, new_right_df = self._create_vector_merging_cols(
                    left_df,
                    left_key,
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                    col_index,
                    absolute_accuracy[col_index],
                )
            elif len(self._DATETIME_JOIN_TYPES.intersection(col_types)) > 0:
                # The tolerance is derived from the full left frame, not the subset being merged.
                tolerance = self._compute_datetime_tolerance(
                    left_df_full,
                    left_key,
                    right_df,
                    right_col[col_index],
                    accuracy[col_index],
                )
                new_left_df, new_right_df = self._create_datetime_merge_cols(
                    left_df,
                    left_key,
                    right_df,
                    right_col[col_index],
                    tolerance,
                    col_index,
                )
            else:
                raise exceptions.InvalidArgumentValueError(
                    "join not surpported on type " + str(col_types)
                )

            # Attach the temporary left merge columns and remember them for the merge.
            left_df[new_left_df.columns] = new_left_df
            new_left_cols += list(new_left_df.columns)

            if new_right_df is None:
                # The right join column itself is the merge key, under a temporary name.
                right_df = right_df.rename(
                    columns={right_col[col_index]: renamed_right}
                )
                new_right_cols.append(renamed_right)
            else:
                # Dedicated right merge columns replace the original right join column.
                right_df[new_right_df.columns] = new_right_df
                new_right_cols += list(new_right_df.columns)
                right_cols_to_drop.append(right_col[col_index])

        # Only the left d3mIndex should survive the merge.
        if "d3mIndex" in right_df.columns:
            right_cols_to_drop.append("d3mIndex")
        right_df = right_df.drop(columns=right_cols_to_drop)

        joined = pd.merge(
            left_df,
            right_df,
            how=self.hyperparams["join_type"],
            left_on=new_left_cols,
            right_on=new_right_cols,
            suffixes=["_left", "_right"],
        )

        # don't want to keep columns that were created specifically for merging
        # also, inner merge keeps the right column we merge on, we want to remove it
        joined = joined.drop(columns=new_left_cols + new_right_cols)

        return joined