def _join_string_col(cls, left_df: container.DataFrame, left_col: str,
                     right_df: container.DataFrame, right_col: str,
                     accuracy: float) -> pd.DataFrame:
    """Inner-join two frames on string columns using fuzzy key matching.

    Each distinct value of ``left_df[left_col]`` is handed once to
    ``cls._string_fuzzy_match`` together with the distinct values of
    ``right_df[right_col]`` and ``accuracy * 100``. Whatever that helper
    returns becomes the join key for every left row carrying that value,
    and the left rows are then inner-joined against ``right_df`` indexed by
    ``right_col``.

    Args:
        left_df: Left frame. It is copied, so the caller's frame is not
            modified.
        left_col: Name of the join column in ``left_df``.
        right_df: Right frame. Its ``d3mIndex`` column, if any, is left out
            of the result so the left one is used.
        right_col: Name of the join column in ``right_df``.
        accuracy: Match threshold; passed to the matcher multiplied by 100.

    Returns:
        The joined frame with a fresh 0..n-1 index, sorted by ``d3mIndex``
        when that column is present and by ``left_col`` otherwise.
        Overlapping column names get the suffixes ``_1`` (left) and ``_2``
        (right).
    """
    # use d3mIndex from left col if present; a right frame without the
    # column is accepted instead of raising KeyError
    right_df = right_df.drop(columns='d3mIndex', errors='ignore')

    # re-indexing below must not leak into the caller's dataframe
    left_df = left_df.copy()

    # pre-compute one fuzzy match per distinct left value
    left_keys = left_df[left_col].unique()
    right_keys = right_df[right_col].unique()
    threshold = accuracy * 100
    matches: typing.Dict[str, typing.Optional[str]] = {
        left_key: cls._string_fuzzy_match(left_key, right_keys, threshold)
        for left_key in left_keys
    }

    # look up the pre-computed match for each element of the left column
    left_df.index = left_df[left_col].map(lambda key: matches[key])

    # make the right col the right dataframe index
    right_df = right_df.set_index(right_col)

    # inner join on the left / right indices
    joined = container.DataFrame(
        left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

    # sort on the d3m index if there, otherwise use the joined column
    sort_key = 'd3mIndex' if 'd3mIndex' in joined else left_col
    joined = joined.sort_values(by=[sort_key])
    return joined.reset_index(drop=True)
def _join_numeric_col(cls, left_df: container.DataFrame, left_col: str,
                      right_df: container.DataFrame, right_col: str,
                      accuracy: float) -> pd.DataFrame:
    """Inner-join two frames on numeric columns using fuzzy key matching.

    Both join columns are coerced with ``pd.to_numeric``. Every left value
    is passed to ``cls._numeric_fuzzy_match`` along with the distinct right
    values and ``accuracy``; the helper's return value is used as that
    row's join key against ``right_df`` indexed by ``right_col``.

    Args:
        left_df: Left frame. It is copied, so the caller's frame is not
            modified.
        left_col: Name of the join column in ``left_df``.
        right_df: Right frame. Its ``d3mIndex`` column, if any, is left out
            of the result so the left one is used.
        right_col: Name of the join column in ``right_df``.
        accuracy: Match threshold, forwarded unchanged to the matcher.

    Returns:
        The joined frame with a fresh 0..n-1 index, sorted by ``d3mIndex``
        when that column is present and by ``left_col`` otherwise.
        ``left_col`` holds the numeric-coerced values. Overlapping column
        names get the suffixes ``_1`` (left) and ``_2`` (right).
    """
    # use d3mIndex from left col if present; a right frame without the
    # column is accepted instead of raising KeyError
    right_df = right_df.drop(columns='d3mIndex', errors='ignore')

    # the column coercion and re-indexing below must not leak into the
    # caller's dataframe
    left_df = left_df.copy()

    # fuzzy match each left join value against the right join values and
    # store the result as the left dataframe index
    right_df[right_col] = pd.to_numeric(right_df[right_col])
    choices = right_df[right_col].unique()
    left_df[left_col] = pd.to_numeric(left_df[left_col])
    left_df.index = left_df[left_col].map(
        lambda value: cls._numeric_fuzzy_match(value, choices, accuracy))

    # make the right col the right dataframe index
    right_df = right_df.set_index(right_col)

    # inner join on the left / right indices
    joined = container.DataFrame(
        left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

    # sort on the d3m index if there, otherwise use the joined column
    sort_key = 'd3mIndex' if 'd3mIndex' in joined else left_col
    joined = joined.sort_values(by=[sort_key])
    return joined.reset_index(drop=True)
def _join_datetime_col(cls, left_df: container.DataFrame, left_col: str,
                       right_df: container.DataFrame, right_col: str,
                       accuracy: float) -> pd.DataFrame:
    """Inner-join two frames on date/time columns using fuzzy key matching.

    Values of both join columns are parsed with ``parser.parse`` and
    converted to ``np.datetime64``. A tolerance equal to
    ``(1.0 - accuracy) * cls._compute_time_range(left_keys, choices)`` is
    handed to ``cls._datetime_fuzzy_match`` for every left value, and the
    helper's return value is used as that row's join key.

    Args:
        left_df: Left frame. It is copied, so the caller's frame is not
            modified.
        left_col: Name of the join column in ``left_df``.
        right_df: Right frame. Its ``d3mIndex`` column, if any, is left out
            of the result so the left one is used.
        right_col: Name of the join column in ``right_df``.
        accuracy: Match accuracy; ``1.0 - accuracy`` scales the time range
            into the matching tolerance.

    Returns:
        The joined frame with a fresh 0..n-1 index, sorted by ``d3mIndex``
        when that column is present and by ``left_col`` otherwise.
        Overlapping column names get the suffixes ``_1`` (left) and ``_2``
        (right).
    """
    # use d3mIndex from left col if present; a right frame without the
    # column is accepted instead of raising KeyError
    right_df = right_df.drop(columns='d3mIndex', errors='ignore')

    # re-indexing below must not leak into the caller's dataframe
    left_df = left_df.copy()

    # compute a tolerance delta for time matching as a fraction of the
    # range reported by _compute_time_range for the left/right values
    choices = np.array(
        [np.datetime64(parser.parse(dt)) for dt in right_df[right_col].unique()])
    left_keys = np.array(
        [np.datetime64(parser.parse(dt)) for dt in left_df[left_col].values])
    time_tolerance = (1.0 - accuracy) * cls._compute_time_range(left_keys, choices)

    left_df.index = np.array(
        [cls._datetime_fuzzy_match(dt, choices, time_tolerance) for dt in left_keys])

    # make the right col the right dataframe index
    # NOTE(review): the right index keeps the raw right_col values, while
    # the left index holds whatever _datetime_fuzzy_match returns for
    # np.datetime64 inputs - confirm the two are comparable, otherwise the
    # inner join below cannot match any rows.
    right_df = right_df.set_index(right_col)

    # inner join on the left / right indices
    joined = container.DataFrame(
        left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

    # sort on the d3m index if there, otherwise use the joined column
    sort_key = 'd3mIndex' if 'd3mIndex' in joined else left_col
    joined = joined.sort_values(by=[sort_key])
    return joined.reset_index(drop=True)
def _produce(
    self,
    *,
    left_df_full: container.DataFrame,  # type: ignore
    left_df: container.DataFrame,  # type: ignore
    right_df: container.DataFrame,  # type: ignore
    join_types: typing.Sequence[str],
    left_col: typing.Sequence[int],
    right_col: typing.Sequence[int],
    accuracy: typing.Sequence[float],
    absolute_accuracy: typing.Sequence[bool]
) -> base.CallResult[Outputs]:
    """Merge ``left_df`` and ``right_df`` on one or more fuzzy-matched column pairs.

    For every position in ``left_col`` the entry of ``join_types`` at that
    position is intersected with the class-level join-type sets, checked in
    the order string, numeric, geo, vector, datetime. The first set that
    overlaps selects the helper that builds the temporary merge columns.

    * string / numeric: the helper returns new left columns only, and the
      right join column is renamed in place to ``"righty_string"`` /
      ``"righty_numeric"`` followed by the position.
    * geo / vector / datetime: the helper returns new columns for both
      frames, and the original right join column is queued for removal.

    The frames are then combined with ``pd.merge`` using
    ``self.hyperparams["join_type"]`` as ``how``, and all temporary merge
    columns are dropped from the result.

    Note that ``left_df`` and ``right_df`` are modified in place (columns
    added, renamed and dropped).

    Args:
        left_df_full: Only used to compute the datetime tolerance.
        left_df: Left frame to merge.
        right_df: Right frame to merge; its ``d3mIndex`` column is dropped
            when present.
        join_types: Per-pair collection of type names used to pick a branch.
        left_col: Per-pair left join column.
        right_col: Per-pair right join column.
        accuracy: Per-pair accuracy handed to the helpers.
        absolute_accuracy: Per-pair flag handed to the numeric, geo and
            vector helpers.

    Returns:
        The merged frame. NOTE(review): the annotation says
        ``base.CallResult[Outputs]`` but the value returned is the frame
        produced by ``pd.merge`` - confirm the caller does the wrapping.

    Raises:
        exceptions.InvalidArgumentValueError: when a pair's join types
            overlap none of the supported sets.
    """

    def shares_type(supported, declared) -> bool:
        # True when at least one declared type is in the supported set
        return len(supported.intersection(declared)) > 0

    merge_keys_left = []
    merge_keys_right = []
    discard_from_right = []

    for position, left_key in enumerate(left_col):
        kinds = join_types[position]
        # set only by the branches that rename the right column rather
        # than generating dedicated right-hand merge columns
        alias = None

        if shares_type(self._STRING_JOIN_TYPES, kinds):
            left_extra = self._create_string_merge_cols(
                left_df,
                left_key,
                right_df,
                right_col[position],
                accuracy[position],
                position,
            )
            alias = "righty_string" + str(position)
        elif shares_type(self._NUMERIC_JOIN_TYPES, kinds):
            left_extra = self._create_numeric_merge_cols(
                left_df,
                left_key,
                right_df,
                right_col[position],
                accuracy[position],
                position,
                absolute_accuracy[position],
            )
            alias = "righty_numeric" + str(position)
        elif shares_type(self._GEO_JOIN_TYPES, kinds):
            left_extra, right_extra = self._create_geo_vector_merging_cols(
                left_df,
                left_key,
                right_df,
                right_col[position],
                accuracy[position],
                position,
                absolute_accuracy[position],
            )
        elif shares_type(self._VECTOR_JOIN_TYPES, kinds):
            left_extra, right_extra = self._create_vector_merging_cols(
                left_df,
                left_key,
                right_df,
                right_col[position],
                accuracy[position],
                position,
                absolute_accuracy[position],
            )
        elif shares_type(self._DATETIME_JOIN_TYPES, kinds):
            tolerance = self._compute_datetime_tolerance(
                left_df_full,
                left_key,
                right_df,
                right_col[position],
                accuracy[position],
            )
            left_extra, right_extra = self._create_datetime_merge_cols(
                left_df,
                left_key,
                right_df,
                right_col[position],
                tolerance,
                position,
            )
        else:
            raise exceptions.InvalidArgumentValueError(
                "join not surpported on type " + str(kinds)
            )

        # attach the generated merge columns and remember them for cleanup
        left_df[left_extra.columns] = left_extra
        merge_keys_left.extend(left_extra.columns)

        if alias is not None:
            # the renamed right column itself serves as the merge key
            right_df.rename(columns={right_col[position]: alias}, inplace=True)
            merge_keys_right.append(alias)
        else:
            right_df[right_extra.columns] = right_extra
            merge_keys_right.extend(right_extra.columns)
            discard_from_right.append(right_col[position])

    if "d3mIndex" in right_df.columns:
        discard_from_right.append("d3mIndex")
    right_df.drop(columns=discard_from_right, inplace=True)

    joined = pd.merge(
        left_df,
        right_df,
        how=self.hyperparams["join_type"],
        left_on=merge_keys_left,
        right_on=merge_keys_right,
        suffixes=["_left", "_right"],
    )

    # the columns created specifically for merging are not part of the
    # output; an inner merge also keeps the right merge column, so it is
    # removed here as well
    joined.drop(columns=merge_keys_left + merge_keys_right, inplace=True)

    return joined