def _join_string_col(cls,
                     left_df: container.DataFrame,
                     left_col: str,
                     right_df: container.DataFrame,
                     right_col: str,
                     accuracy: float) -> pd.DataFrame:
    """Inner-join two dataframes on fuzzily matched string columns.

    Every distinct value of ``left_df[left_col]`` is passed once to
    ``cls._string_fuzzy_match`` together with the distinct values of
    ``right_df[right_col]`` and ``accuracy * 100``. The result of that call
    (presumably the best matching right key, or ``None`` when nothing is
    close enough -- confirm against ``_string_fuzzy_match``) becomes the
    join key for all left rows holding that value.

    Neither input dataframe is modified.

    Args:
        left_df: left dataframe; its ``d3mIndex`` column, if any, is kept.
        left_col: name of the join column in ``left_df``.
        right_df: right dataframe; its ``d3mIndex`` column, if any, is dropped.
        right_col: name of the join column in ``right_df``. It becomes the
            join index and therefore does not appear in the result columns.
        accuracy: match threshold, multiplied by 100 before being handed to
            ``cls._string_fuzzy_match``.

    Returns:
        The inner join with overlapping column names suffixed ``_1`` (left)
        and ``_2`` (right), sorted by ``d3mIndex`` when that column is
        present and by ``left_col`` otherwise, with a fresh 0..n-1 index.
    """
    # use d3mIndex from left col if present; errors='ignore' keeps this from
    # raising when the right dataframe carries no d3mIndex column at all
    right_df = right_df.drop(columns='d3mIndex', errors='ignore')

    # work on a copy so the caller's dataframe keeps its original index
    left_df = left_df.copy()

    # pre-compute fuzzy matches, one lookup per distinct left key
    right_keys = right_df[right_col].unique()
    threshold = accuracy * 100
    matches: typing.Dict[str, typing.Optional[str]] = {
        left_key: cls._string_fuzzy_match(left_key, right_keys, threshold)
        for left_key in left_df[left_col].unique()
    }

    # look up pre-computed fuzzy match for each element in the left column;
    # .values drops the series name so the index is unnamed and cannot clash
    # with the `left_col` column label when sorting below
    left_df.index = left_df[left_col].map(lambda key: matches[key]).values

    # make the right col the right dataframe index
    right_df = right_df.set_index(right_col)

    # inner join on the left / right indices
    joined = container.DataFrame(
        left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

    # sort on the d3m index if there, otherwise use the joined column
    sort_col = 'd3mIndex' if 'd3mIndex' in joined else left_col
    joined = joined.sort_values(by=[sort_col])
    return joined.reset_index(drop=True)
def _join_numeric_col(cls,
                      left_df: container.DataFrame,
                      left_col: str,
                      right_df: container.DataFrame,
                      right_col: str,
                      accuracy: float) -> pd.DataFrame:
    """Inner-join two dataframes on fuzzily matched numeric columns.

    Both join columns are converted with ``pd.to_numeric``. Each left value
    is then passed to ``cls._numeric_fuzzy_match`` together with the distinct
    right values and ``accuracy``; the result of that call (presumably the
    matched right value, or a null when nothing qualifies -- confirm against
    ``_numeric_fuzzy_match``) is used as the join key for that row.

    Neither input dataframe is modified; the numeric conversion of the left
    column is visible only in the returned dataframe.

    Args:
        left_df: left dataframe; its ``d3mIndex`` column, if any, is kept.
        left_col: name of the join column in ``left_df``.
        right_df: right dataframe; its ``d3mIndex`` column, if any, is dropped.
        right_col: name of the join column in ``right_df``. It becomes the
            join index and therefore does not appear in the result columns.
        accuracy: match threshold, forwarded unchanged to
            ``cls._numeric_fuzzy_match``.

    Returns:
        The inner join with overlapping column names suffixed ``_1`` (left)
        and ``_2`` (right), sorted by ``d3mIndex`` when that column is
        present and by ``left_col`` otherwise, with a fresh 0..n-1 index.

    Raises:
        ValueError: from ``pd.to_numeric`` when a join column holds a value
            that cannot be parsed as a number.
    """
    # use d3mIndex from left col if present; errors='ignore' keeps this from
    # raising when the right dataframe carries no d3mIndex column at all
    right_df = right_df.drop(columns='d3mIndex', errors='ignore')

    # work on a copy so the caller's column and index are left untouched
    left_df = left_df.copy()

    # fuzzy match each of the left join col against the right join col value
    # and save the results as the left dataframe index
    right_df[right_col] = pd.to_numeric(right_df[right_col])
    choices = right_df[right_col].unique()
    left_df[left_col] = pd.to_numeric(left_df[left_col])
    matched = left_df[left_col].map(
        lambda x: cls._numeric_fuzzy_match(x, choices, accuracy))
    # .values drops the series name so the index is unnamed and cannot clash
    # with the `left_col` column label when sorting below
    left_df.index = matched.values

    # make the right col the right dataframe index
    right_df = right_df.set_index(right_col)

    # inner join on the left / right indices
    joined = container.DataFrame(
        left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

    # sort on the d3m index if there, otherwise use the joined column
    sort_col = 'd3mIndex' if 'd3mIndex' in joined else left_col
    joined = joined.sort_values(by=[sort_col])
    return joined.reset_index(drop=True)
def _join_datetime_col(cls,
                       left_df: container.DataFrame,
                       left_col: str,
                       right_df: container.DataFrame,
                       right_col: str,
                       accuracy: float) -> pd.DataFrame:
    """Inner-join two dataframes on fuzzily matched date/time columns.

    Values of both join columns are parsed with ``parser.parse`` and turned
    into ``np.datetime64``. Each parsed left value is passed to
    ``cls._datetime_fuzzy_match`` together with the distinct parsed right
    values and a tolerance of
    ``(1.0 - accuracy) * cls._compute_time_range(left_keys, choices)``.
    The result of that call is used as the join key for the row; this
    assumes it is one of ``choices`` or ``None`` -- confirm against
    ``_datetime_fuzzy_match``.

    Neither input dataframe is modified.

    Args:
        left_df: left dataframe; its ``d3mIndex`` column, if any, is kept.
        left_col: name of the join column in ``left_df``.
        right_df: right dataframe; its ``d3mIndex`` column, if any, is dropped.
        right_col: name of the join column in ``right_df``. It is replaced by
            the join index and therefore does not appear in the result
            columns.
        accuracy: match threshold; ``1.0 - accuracy`` scales the time range
            into the matching tolerance.

    Returns:
        The inner join with overlapping column names suffixed ``_1`` (left)
        and ``_2`` (right), sorted by ``d3mIndex`` when that column is
        present and by ``left_col`` otherwise, with a fresh 0..n-1 index.
    """
    # use d3mIndex from left col if present; errors='ignore' keeps this from
    # raising when the right dataframe carries no d3mIndex column at all
    right_df = right_df.drop(columns='d3mIndex', errors='ignore')

    # work on a copy so the caller's dataframe keeps its original index
    left_df = left_df.copy()

    # parse every distinct right value once; the dict keeps first-seen order,
    # so `choices` lines up with right_df[right_col].unique()
    right_raw = right_df[right_col]
    parsed_right = {raw: np.datetime64(parser.parse(raw)) for raw in right_raw.unique()}
    choices = np.array(list(parsed_right.values()))
    left_keys = np.array([np.datetime64(parser.parse(dt)) for dt in left_df[left_col].values])

    # compute a tolerance delta for time matching from the time range of the
    # left / right values, scaled by (1 - accuracy)
    time_tolerance = (1.0 - accuracy) * cls._compute_time_range(left_keys, choices)

    # the matched right timestamp (NaT when there is no match) becomes the left index
    left_df.index = pd.to_datetime(
        [cls._datetime_fuzzy_match(dt, choices, time_tolerance) for dt in left_keys])

    # index the right dataframe by the *parsed* timestamps rather than the raw
    # column values, so both join indices hold the same datetime type and the
    # join does not depend on pandas implicitly coercing strings to datetimes
    right_df = right_df.drop(columns=[right_col])
    right_df.index = pd.to_datetime([parsed_right[raw] for raw in right_raw])

    # inner join on the left / right indices
    joined = container.DataFrame(left_df.join(right_df, lsuffix='_1', rsuffix='_2', how='inner'))

    # sort on the d3m index if there, otherwise use the joined column
    sort_col = 'd3mIndex' if 'd3mIndex' in joined else left_col
    joined = joined.sort_values(by=[sort_col])
    return joined.reset_index(drop=True)