def test_small_no_merge(cli_link_small_no_merge): expected_dict = pd.read_excel( current_dir / "small.xlsx", sheet_name=["LINK_INSTR1", "INSTR2_linked", "INSTR3_linked"]) expected_primary = expected_dict["LINK_INSTR1"] expected_secondary_instr2 = expected_dict["INSTR2_linked"] expected_secondary_instr3 = expected_dict["INSTR3_linked"] # copy file to current dir if you want to debug more if output_dir is not None: copy(cli_link_small_no_merge, current_dir) results_dict = pd.read_excel( cli_link_small_no_merge, sheet_name=["small_anchor", "instr2_all_linked", "instr3_all_linked"], ) result_primary = results_dict["small_anchor"] result_secondary_instr2 = results_dict["instr2_all_linked"] result_secondary_instr3 = results_dict["instr3_all_linked"] assert_dfs_equal( result_primary, expected_primary, cols_ignore=cols_ignore, cols_ignore_pat=cols_ignore_pat, output_dir=output_dir, ) cols_ignore2 = [ get_option("column.system.abs_diff_days"), get_option("column.system.diff_days"), "PIDN", "VType", "_merge", ] assert_dfs_equal( result_secondary_instr2, expected_secondary_instr2, cols_ignore=cols_ignore2, cols_ignore_pat=cols_ignore_pat, output_dir=output_dir, ) assert_dfs_equal( result_secondary_instr3, expected_secondary_instr3, cols_ignore=cols_ignore2, cols_ignore_pat=cols_ignore_pat, output_dir=output_dir, )
def __init__( self, data=None, *args, id_col_name=None, date_col_name=None, id2_col_name=None, name=None, tags=None, display_name_generator=None, id_col_dropna=False, **kwargs, ): super().__init__(data, *args, **kwargs) self.id_col_name = id_col_name self.date_col_name = date_col_name self.id2_col_name = id2_col_name self.name = name if name else get_option("dataset.default.name") self.tags = tags self.display_name_generator = (display_name_generator if display_name_generator else self.default_display_name_generator) if id_col_dropna: self.dropna(subset=["id_col_name"], inplace=True)
def add_diff_days(df: pd.DataFrame, col_start: str, col_end: str, diff_days_col: str = None, inplace=False): """Adds a column to DataFrame called ``_diff_days`` which contains the number of days between ``col_start`` and ``col_end`` :param df: DataFrame :param col_start: column containing the start date :param col_end: column containing the end date """ if diff_days_col is None: diff_days_col = get_option("column.system.diff_days") if col_start == col_end: raise KeyError( "date columns have the same name: {col_start}=={col_end}") if not inplace: df = df.copy() df[diff_days_col] = df[col_end] - df[col_start] df[diff_days_col] = df[diff_days_col] / np.timedelta64(1, "D") # df.assign(**{diff_days_col: (df[col_end] - df[col_start]) / np.timedelta64(1, "D")}) if not inplace: return df
def date_proximity( self, right_dset: "Dataset", get: str = "all", when: str = "earlier_or_later", days: int = 90, dropna: bool = False, drop_duplicates: bool = False, duplicates_indicator: bool = False, merge_suffixes=get_option("operators.binary.column_suffixes"), prepend_level_name: bool = True, ) -> None: """ Links data across this :class:`Dataset` and ``right_dset``, updating :attr:`df` with the results. Calls :func:`macpie.date_proximity`, passing in ``right_dset`` as the "left" Dataset, and this Dataset as the "right" Dataset. """ from macpie.operators.date_proximity import date_proximity return date_proximity( left=self, right=right_dset, get=get, when=when, days=days, dropna=dropna, drop_duplicates=drop_duplicates, duplicates_indicator=duplicates_indicator, merge_suffixes=merge_suffixes, prepend_level_name=prepend_level_name, )
def __init__( self, left: pd.DataFrame, right: pd.DataFrame, on=None, left_on=None, right_on=None, merge_suffixes=get_option("operators.binary.column_suffixes"), add_suffixes=False, add_indexes=(None, None), ): self.left = self.orig_left = left self.right = self.orig_right = right self.on = lltools.maybe_make_list(on) self.left_on = lltools.maybe_make_list(left_on) self.right_on = lltools.maybe_make_list(right_on) self.merge_suffixes = merge_suffixes self.add_suffixes = validatortools.validate_bool_kwarg( add_suffixes, "add_suffixes") self.add_indexes = add_indexes self._validate_specification()
def mark_duplicates_by_cols(df: pd.DataFrame, cols: List[str]): """Create a column in ``df`` called ``_duplicates`` which is a boolean Series denoting duplicate rows as identified by ``cols``. :param df: DataFrame :param cols: Only consider these columns for identifiying duplicates """ df[get_option("column.system.duplicates")] = df.duplicated(subset=cols, keep=False) return df
def primary(self, dset: Dataset): """Sets the `primary` Dataset of this collection.""" AnchoredList.primary.fset(self, dset) if self._primary is not None: if self._primary_anchor_col is None: self._primary_anchor_col = self._primary.id_col_name if self._secondary_anchor_col is None: self._secondary_anchor_col = ( self._primary.id_col_name + get_option("operators.binary.column_suffixes")[0])
def write_tablib_dataset(self, tlset: tl.Dataset, freeze_panes=True): sheet_name = (safe_xlsx_sheet_title(tlset.title, "-") if tlset.title else (get_option("excel.sheet_name.default"))) ws = self.book.add_worksheet(sheet_name, autofit_columns=True) xlsxwritertools.tlset_sheet(tlset, ws, self.format_bold, self.format_text_wrap, freeze_panes=freeze_panes)
def test_add_diff_days(): d = { "col1": ["a", "b", "c"], "col2": [datetime(2001, 3, 2), datetime(2001, 2, 1), datetime(2001, 8, 1)], "col3": [datetime(2002, 3, 2), datetime(2001, 3, 1), datetime(2001, 8, 2)], } df = pd.DataFrame(data=d) with pytest.raises(KeyError): df.mac.add_diff_days("col2", "col2") df = df.mac.add_diff_days("col2", "col3") assert get_option("column.system.diff_days") in df.columns assert df[get_option("column.system.diff_days")].equals( pd.Series([365.0, 28.0, 1.0]))
def execute(self): prim_dset = Dataset.from_file( self.primary, id_col_name=None, date_col_name=self.date_col, id2_col_name=self.id2_col, name=self.primary.stem, ) if self.id_col in prim_dset.columns: prim_dset.id_col_name = self.id_col else: click.echo( "\nWARNING: ID Column Header (-i, --id-col) not specified " 'and default of "InstrID" not found in your PRIMARY file.') click.echo( f' Creating one for you called "{NEW_ID_COL_NAME}"\n') prim_dset.create_id_col(col_name=NEW_ID_COL_NAME) collection = MergeableAnchoredList(prim_dset) if self.secondary: for sec in self.secondary: try: sec_dset = Dataset.from_file( sec, id_col_name=self.secondary_id_col, date_col_name=self.secondary_date_col, id2_col_name=self.secondary_id2_col, name=sec.stem, ) sec_dset_linked = prim_dset.date_proximity( right_dset=sec_dset, get=self.secondary_get, when=self.secondary_when, days=self.secondary_days, merge_suffixes=get_option( "operators.binary.column_suffixes"), prepend_level_name=False, ) collection.add_secondary(sec_dset_linked) except Exception as e: click.echo(f'\nERROR loading secondary dataset "{sec}"\n') click.echo(e) raise (e) self.collection = collection
def __new__(cls, *args, **kwargs): # only switch class if generic(MACPieExcelWriter) if cls is MACPieExcelWriter: engine = kwargs.pop("engine", None) if engine is None: engine = get_option("excel.writer.engine") if engine not in ("mp_xlsxwriter", "mp_openpyxl"): raise ValueError( f"Invalid engine: '{engine}''. Only 'mp_xlsxwriter' or 'mp_openpyxl' supported." ) cls = pd.io.excel._util.get_writer(engine) return object.__new__(cls)
def sys_cols(self): """Returns list of `system` column names of this :class:`Dataset`, defined as any columns starting with ``column.system.prefix`` option. """ sys_col_prefix = get_option("column.system.prefix") if isinstance(self.columns, pd.MultiIndex): return [ col for col in self.columns if col[-1].startswith(sys_col_prefix) ] else: return [ col for col in self.columns if col.startswith(sys_col_prefix) ]
def test_keep_earliest_csv(): # test earliest df = file_to_dataframe(data_dir / "instr1_primaryall.csv") result = df.mac.group_by_keep_one(group_by_col="pidn", date_col_name="dcdate", keep="earliest") assert get_option("column.system.duplicates") in result.columns expected_result = file_to_dataframe(data_dir / "instr1_primaryearliest.csv") assert_dfs_equal(result, expected_result, cols_ignore=cols_ignore, output_dir=output_dir)
def merge( left, right, on=None, left_on=None, right_on=None, merge_suffixes=get_option("operators.binary.column_suffixes"), add_suffixes=False, add_indexes=(None, None), ) -> pd.DataFrame: """Merge :class:`pandas.DataFrame` objects with a database-style join, similar to :meth:`pandas.DataFrame.merge`. :param left: DataFrame :param right: the DataFrame to merge with :param on: column(s) to join on. These must be found in both DataFrames. :param left_on: column(s) to join on in the left DataFrame :param right_on: column(s) to join on in the right DataFrame :param merge_suffixes: A length-2 sequence where the first element is suffix to add to the left DataFrame columns, and second element is suffix to add to the right DataFrame columns. Only added if ``add_suffixes`` is ``True``. :param add_suffixes: Whether to add the suffixes specified in ``merge_suffixes`` or not :param add_indexes: A length-2 sequence where each element is optionally a string indicating a top-level index to add to columnn indexes in ``left`` and ``right`` respectively (thus creating a :class:`pandas.MultiIndex` if needed). Pass a value of ``None`` instead of a string to indicate that the column index in ``left`` or ``right`` should be left as-is. At least one of the values must not be ``None``. """ op = _MergeOperation( left, right, on=on, left_on=left_on, right_on=right_on, merge_suffixes=merge_suffixes, add_suffixes=add_suffixes, add_indexes=add_indexes, ) return op.get_result()
def merge( self, right: pd.DataFrame, on=None, left_on=None, right_on=None, merge_suffixes=get_option("operators.binary.column_suffixes"), add_suffixes=False, add_indexes=(None, None), ): """see :meth:`macpie.pandas.merge`""" return operators.merge.merge( self._df, right, on=on, left_on=left_on, right_on=right_on, merge_suffixes=merge_suffixes, add_suffixes=add_suffixes, add_indexes=add_indexes, )
def date_proximity( self, right: pd.DataFrame, id_on=None, id_left_on=None, id_right_on=None, date_on=None, date_left_on=None, date_right_on=None, get: str = "all", when: str = "earlier_or_later", days: int = 90, left_link_id=None, dropna: bool = False, drop_duplicates: bool = False, duplicates_indicator: bool = False, merge="partial", merge_suffixes=get_option("operators.binary.column_suffixes"), ): """see :meth:`macpie.pandas.date_proximity`""" return operators.date_proximity.date_proximity( self._df, right, id_on=id_on, id_left_on=id_left_on, id_right_on=id_right_on, date_on=date_on, date_left_on=date_left_on, date_right_on=date_right_on, get=get, when=when, days=days, left_link_id=left_link_id, dropna=dropna, drop_duplicates=drop_duplicates, duplicates_indicator=duplicates_indicator, merge=merge, merge_suffixes=merge_suffixes, )
def add_secondary(self, dset: Dataset): """Append `dset` to :attr:`MergeableAnchoredList.secondary`.""" if self._secondary_anchor_col not in dset.columns: warnings.warn( f"Warning: Secondary dataset '{dset!r}' does not have " f"anchor column '{self._secondary_anchor_col}'. Skipping...") return dups = dset.duplicated(subset=[self._secondary_anchor_col], keep=False) if dups.any(): dups_col = get_option("column.system.duplicates") if dups_col in dset.sys_cols: dset.rename_col(dups_col, dups_col + "_prior", inplace=True) dset.mac.insert(dups_col, dups) dset.add_tag(Dataset.tag_duplicates) else: dset.add_tag(MergeableAnchoredList.tag_mergeable) super().add_secondary(dset) dset.display_name_generator = MergeableAnchoredList.dataset_display_name_generator self._merged_dset = None
def diff_rows(left: pd.DataFrame, right: pd.DataFrame, cols_ignore=set(), cols_ignore_pat=None): """If ``left`` and ``right`` share the same columns, returns a DataFrame containing rows that differ. :param left: left DataFrame :param right: right DataFrame :param cols_ignore: a list of any columns to ignore """ left = drop_cols(left, cols_list=cols_ignore, cols_pat=cols_ignore_pat) right = drop_cols(right, cols_list=cols_ignore, cols_pat=cols_ignore_pat) left_only_cols, right_only_cols = diff_cols(left, right) if left_only_cols == right_only_cols == set(): indicator_col_name = get_option( "column.system.prefix") + "_diff_rows_merge" if isinstance(left.columns, pd.MultiIndex) or isinstance( right.columns, pd.MultiIndex): # TODO: Doing a pd.merge() on MultiIndex dataframes with indicator # set to True/string resulted in the following error: # pandas.errors.PerformanceWarning: dropping on a non-lexsorted multi-index # without a level parameter may impact performance # Flatten the column MultiIndexes to get around this left.columns = left.columns.to_flat_index() right.columns = right.columns.to_flat_index() merged_df = pd.merge(left, right, indicator=indicator_col_name, how="outer") changed_rows_df = merged_df[merged_df[indicator_col_name] != "both"] return changed_rows_df raise KeyError("Dataframes do not share the same columns")
from pathlib import Path from macpie._config import get_option from macpie.pandas import file_to_dataframe from macpie.testing import assert_dfs_equal current_dir = Path("tests/pandas/operators/date_proximity/dupes/").resolve() # output_dir = current_dir output_dir = None cols_ignore = [ get_option("column.system.abs_diff_days"), get_option("column.system.diff_days"), get_option("column.system.duplicates"), '_abs_diff_days', '_diff_days', '_duplicates' ] def test_dupes(): primary = file_to_dataframe(current_dir / "primary.xlsx") secondary = file_to_dataframe(current_dir / "secondary.xlsx") dupes_result = primary.mac.date_proximity( secondary, id_on='pidn', date_on='dcdate', get='closest', when='earlier_or_later', days=90,
def _get_sheet_name(self, sheet_name): if not sheet_name: sheet_name = get_option("excel.sheet_name.default") return super()._get_sheet_name(sheet_name)
def group_by_keep_one( df: pd.DataFrame, group_by_col: str, date_col_name: str, keep: str = "all", id_col_name: str = None, drop_duplicates: bool = False, ) -> pd.DataFrame: """ Given a :class:`pandas.DataFrame` object, group on the ``group_by_col`` column and keep only the earliest or latest row in each group as determined by the date in the ``date_col_name`` column. :param df: the DataFrame to operate on :param group_by_col: the DataFrame column to group on :param date_col_name: the date column to determine which row is earliest or latest :param keep: specify which row of each group to keep ``all`` keep all rows ``earliest`` in each group, keep only the earliest (i.e. oldest) row ``latest`` in each group, keep only the latest (i.e. most recent) row :param id_col_name: if ``drop_duplicates=True``, the column specified here will also be used for identifying duplicates :param drop_duplicates: if ``True``, then if more than one row is determined to be earliest or or latest in each group, drop all duplicates except the first occurrence. If ``id_col_name`` is specified, then that column will also be used for identifying duplicates """ # groupby.first() and groupby.last() can't handle NaN values (ongoing bug) # use groupby.nth(0) and groupby.nth(-1) instead group_by_col = df.mac.get_col_name(group_by_col) date_col_name = df.mac.to_datetime(date_col_name) if keep not in ["all", "earliest", "latest"]: raise ValueError("invalid keep option") id_col_name = df.mac.get_col_name( id_col_name) if id_col_name is not None else None drop_duplicates = validatortools.validate_bool_kwarg( drop_duplicates, "drop_duplicates") cols = [group_by_col, date_col_name] # first drop rows where group_by col or date col is na and re-index df = df.dropna(subset=cols).reset_index(drop=True) df = df.sort_values(by=cols + [id_col_name] if id_col_name is not None else cols, na_position="last") if keep in {"earliest", "latest"}: pre_results = None if keep == "earliest": pre_results = df.groupby(group_by_col, sort=False, as_index=False).nth(0) else: # keep == 'latest' pre_results = df.groupby(group_by_col, sort=False, as_index=False).nth(-1) # in case there are duplicates, keep them pre_results = pre_results.filter(items=cols) df = pd.merge(df, pre_results, how="inner", on=cols, indicator=False) dup_cols = cols + [id_col_name] if id_col_name is not None else cols dups = df.duplicated(subset=dup_cols, keep=False) if dups.any(): if drop_duplicates: df = df.drop_duplicates(subset=dup_cols, keep="first", ignore_index=True) else: df.mac.insert(get_option("column.system.duplicates"), dups) return df
"--secondary-get", default="all", type=click.Choice(["all", "closest"], case_sensitive=False), ) @click.option("-t", "--secondary-days", default=90) @click.option( "-w", "--secondary-when", default="earlier_or_later", type=click.Choice(["earlier", "later", "earlier_or_later"], case_sensitive=False), ) @click.option( "-i", "--secondary-id-col", default=get_option("dataset.id_col_name"), help="Secondary ID Column Header", ) @click.option( "-d", "--secondary-date-col", default=get_option("dataset.date_col_name"), help="Secondary Date Column Header", ) @click.option( "-j", "--secondary-id2-col", default=get_option("dataset.id2_col_name"), help="Secondary ID2 Column Header", ) @click.option("--merge-results/--no-merge-results", default=True)
def test_any_duplicates(): d = { "col1": ["a", "b", "c"], "col2": [datetime(2001, 3, 2), datetime(2001, 2, 1), datetime(2001, 8, 1)], "col3": [datetime(2002, 3, 2), datetime(2001, 3, 1), datetime(2001, 8, 2)], } df = pd.DataFrame(data=d) # no col4 column raises KeyError with pytest.raises(KeyError): df.mac.any_duplicates("col4") d = { get_option("column.system.duplicates"): [False, False, False], "col2": [datetime(2001, 3, 2), datetime(2001, 2, 1), datetime(2001, 8, 1)], "col3": [datetime(2002, 3, 2), datetime(2001, 3, 1), datetime(2001, 8, 2)], } df = pd.DataFrame(data=d) assert not df[get_option("column.system.duplicates")].any() d = { get_option("column.system.duplicates"): [True, False, False], "col2": [datetime(2001, 3, 2), datetime(2001, 2, 1), datetime(2001, 8, 1)], "col3": [datetime(2002, 3, 2), datetime(2001, 3, 1), datetime(2001, 8, 2)], } df = pd.DataFrame(data=d) assert df.mac.any_duplicates(get_option("column.system.duplicates")) d = { "col1": ["a", "b", "c"], "col2": [None, None, 6], "col3": [np.nan, np.nan, 8] } df = pd.DataFrame(data=d) # nulls by default count as duplicates assert df.mac.any_duplicates("col2") assert df.mac.any_duplicates("col3") # don't count nulls as duplicates assert not df.mac.any_duplicates("col2", ignore_nan=True) assert not df.mac.any_duplicates("col3", ignore_nan=True)
def to_excel( self, excel_writer, sheet_name=None, na_rep="", float_format=None, columns=None, header=True, index=False, index_label=None, startrow=0, startcol=0, engine=None, merge_cells=True, encoding=None, inf_rep="inf", verbose=True, freeze_panes=None, storage_options=None, write_excel_dict=True, highlight_duplicates=True, ) -> None: """Write :class:`Dataset` to an Excel sheet. :param excel_writer: File path or existing ExcelWriter. :param kwargs: """ from macpie.io.formats.excel import MACPieExcelFormatter if isinstance(excel_writer, MACPieExcelWriter): need_save = False else: excel_writer = MACPieExcelWriter( excel_writer, engine=engine, storage_options=storage_options, ) need_save = True try: if isinstance(self, pd.core.dtypes.generic.ABCDataFrame): dset = self else: dset = self.to_frame() formatter = MACPieExcelFormatter( dset, na_rep=na_rep, cols=columns, header=header, float_format=float_format, index=index, index_label=index_label, merge_cells=merge_cells, inf_rep=inf_rep, ) if highlight_duplicates: dups_col_name = get_option("column.system.duplicates") if dups_col_name in self.columns: formatter.highlight_row_by_column_predicate( column=dups_col_name, predicate=bool) if sheet_name is None: sheet_name = self.excel_sheetname formatter.write( excel_writer, sheet_name=sheet_name, startrow=startrow, startcol=startcol, freeze_panes=freeze_panes, ) if write_excel_dict: to_excel_kwargs = { "sheet_name": sheet_name, "na_rep": na_rep, "float_format": float_format, "columns": columns, "header": header, "index": index, "index_label": index_label, "startrow": startrow, "startcol": startcol, "engine": engine, "merge_cells": merge_cells, "encoding": encoding, "inf_rep": inf_rep, "verbose": verbose, "freeze_panes": freeze_panes, "storage_options": storage_options, "write_excel_dict": write_excel_dict, "highlight_duplicates": highlight_duplicates, } if not header: header_col = None elif self.columns.nlevels > 1 and merge_cells: # if merge_cells is False, MultiIndex header will be in # legacy format, whiich is one row with dots to indicate levels. header_col = list(range(0, self.columns.nlevels)) else: header_col = 0 if not index: index_col = None elif self.index.nlevels > 1: index_col = list(range(0, self.index.nlevels)) else: index_col = 0 # ensure successful write/read round-trip read_excel_kwargs = { "header": header_col, "index_col": index_col } excel_dict = self.to_excel_dict() excel_dict["to_excel_kwargs"] = to_excel_kwargs excel_dict["read_excel_kwargs"] = read_excel_kwargs excel_writer.write_excel_dict(excel_dict) finally: # make sure to close opened file handles if need_save: excel_writer.close()
from pathlib import Path import pytest from macpie._config import get_option from macpie.pandas import file_to_dataframe from macpie.testing import assert_dfs_equal data_dir = Path("tests/data/").resolve() current_dir = Path("tests/pandas/operators/group_by_keep_one/").resolve() # output_dir = current_dir output_dir = None cols_ignore = ["link_date", "link_id", get_option("column.system.duplicates")] def test_keep_earliest_csv(): # test earliest df = file_to_dataframe(data_dir / "instr1_primaryall.csv") result = df.mac.group_by_keep_one(group_by_col="pidn", date_col_name="dcdate", keep="earliest") assert get_option("column.system.duplicates") in result.columns expected_result = file_to_dataframe(data_dir / "instr1_primaryearliest.csv") assert_dfs_equal(result,
def __init__( self, left: pd.DataFrame, right: pd.DataFrame, id_on=None, id_left_on=None, id_right_on=None, date_on=None, date_left_on=None, date_right_on=None, get: str = "all", when: str = "earlier_or_later", days: int = 90, left_link_id=None, dropna: bool = False, drop_duplicates: bool = False, duplicates_indicator: bool = False, merge="partial", merge_suffixes=get_option("operators.binary.column_suffixes"), prepend_levels=(None, None), ): self.left = left self.right = right self.id_on = lltools.maybe_make_list(id_on) self.id_left_on = lltools.maybe_make_list(id_left_on) self.id_right_on = lltools.maybe_make_list(id_right_on) self.date_on = date_on self.date_left_on = date_left_on self.date_right_on = date_right_on self.get = get self.when = when self.days = days self.left_link_id = left_link_id self.dropna = validatortools.validate_bool_kwarg(dropna, "dropna") self.drop_duplicates = validatortools.validate_bool_kwarg( drop_duplicates, "drop_duplicates" ) self.duplicates_indicator = duplicates_indicator self.duplicates_indicator_name: Optional[str] if isinstance(self.duplicates_indicator, str): self.duplicates_indicator_name = self.duplicates_indicator elif isinstance(self.duplicates_indicator, bool): self.duplicates_indicator_name = ( get_option("column.system.duplicates") if self.duplicates_indicator else None ) else: raise ValueError("indicator option can only accept boolean or string arguments") self.merge = merge self.merge_suffixes = merge_suffixes self.prepend_levels = prepend_levels self._left_suffix = get_option("operators.binary.column_suffixes")[0] self._right_suffix = get_option("operators.binary.column_suffixes")[1] self._diff_days_col = get_option("column.system.diff_days") self._abs_diff_days_col = get_option("column.system.abs_diff_days") self._merge_indicator_col = get_option("column.system.merge") self._validate_specification()
def date_proximity( left, right, id_on=None, id_left_on=None, id_right_on=None, date_on=None, date_left_on=None, date_right_on=None, get: str = "all", when: str = "earlier_or_later", days: int = 90, left_link_id=None, dropna: bool = False, drop_duplicates: bool = False, duplicates_indicator: bool = False, merge="partial", merge_suffixes=get_option("operators.binary.column_suffixes"), prepend_levels=(None, None), ) -> pd.DataFrame: """Links data across two :class:`pandas.DataFrame` objects by date proximity. Specifically, a "left" DataFrame contains a timepoint anchor, and a "right" DataFrame is linked to the left by retrieving all rows that match on a specified id col, and whose specified date fields are within a certain time range of each other. :param left: the DataFrame containing the timepoint anchor :param right: the DataFrame to link :param id_on: primary column to join on. These must be found in both DataFrames. :param id_left_on: primary column to join on in the left DataFrame :param id_right_on: primary column to join on in the right DataFrame :param date_on: date columns to use for timepoint matching. These must be found in both DataFrames, and the one on the left will act as timepoint anchor. :param date_left_on: date column in left DataFrame to act as timepoint anchor. :param date_right_on: date column in the right DataFrame to compare with left's timepoint anchor :param get: which rows of the right DataFrame to link in reference to the timepoint anchor: ``all`` keep all rows ``closest`` get only the closest row that is within ``days`` days of the right DataFrame timepoint anchor :param when: which rows of the right DataFrame to link in temporal relation to the timepoint anchor ``earlier`` get only rows that are earlier than the timepoint anchor ``later`` get only rows that are lter (more recent) than the timepoint anchor ``earlier_or_later`` get rows that are earlier or later than the timepoint anchor :param days: the time range measured in days :param left_link_id: the id column in the left DataFrame to act as the primary key of that data. This helps to ensure there are no duplicates in the left DataFrame (i.e. rows with the same ``id_left_on`` and ``date_left_on``) :param dropna: whether to exclude rows that did not find any match :param merge: which columns to include in result ``partial`` include only columns from the right DataFrame ``full`` include all columns from both left and right DataFrames :param merge_suffixes: A length-2 sequence where the first element is suffix to add to the left DataFrame columns, and second element is suffix to add to the right DataFrame columns. :param prepend_levels: A length-2 sequence where each element is optionally a string indicating a top-level index to add to columnn indexes in ``left`` and ``right`` respectively (thus creating a :class:`pandas.MultiIndex` if needed). Pass a value of ``None`` instead of a string to indicate that the column index in ``left`` or ``right`` should be left as-is. At least one of the values must not be ``None``. """ op = _DateProximityOperation( left, right, id_on=id_on, id_left_on=id_left_on, id_right_on=id_right_on, date_on=date_on, date_left_on=date_left_on, date_right_on=date_right_on, get=get, when=when, days=days, left_link_id=left_link_id, dropna=dropna, drop_duplicates=drop_duplicates, duplicates_indicator=duplicates_indicator, merge=merge, merge_suffixes=merge_suffixes, prepend_levels=prepend_levels, ) return op.get_result()
from macpie._config import get_option from macpie.pandas import file_to_dataframe from macpie.testing import assert_dfs_equal data_dir = Path("tests/data/").resolve() current_dir = Path(__file__).parent.absolute() # output_dir = current_dir output_dir = None cols_ignore = [ 'PIDN', 'VType', 'DayDiff' ] cols_ignore_pat = '^' + get_option("column.system.prefix") dfs_dict = pd.read_excel( data_dir / "instr2.xlsx", sheet_name=[ 'LINK_INSTR2', 'INSTR1_linked', 'INSTR3_linked' ] ) primary = dfs_dict['LINK_INSTR2'] @pytest.mark.slow def test_secondary_instr1():
def date_proximity( left: mp.Dataset, right: mp.Dataset, get: str = "all", when: str = "earlier_or_later", days: int = 90, dropna: bool = False, drop_duplicates: bool = False, duplicates_indicator: bool = False, merge_suffixes=get_option("operators.binary.column_suffixes"), prepend_level_name: bool = True, ) -> None: """Links data across two :class:`Dataset` objects by date proximity, first joining them on their :attr:`Dataset.id2_col_name`. Specifically, a `left` Dataset contains a timepoint anchor, and a `right` Dataset is linked to the `left` by retrieving all rows that match on :attr:`Dataset.id2_col_name`, and whose :attr:`Dataset.date_col_name` fields are within a certain time range of each other. This is the :class:`Dataset` analog of :func:`macpie.pandas.date_proximity`. :param left: the :class:`Dataset` containing the timepoint anchor :param right: the :class:`Dataset` to link. Its :attr:`Dataset.df` attribute gets updated with the results of this operation :param get: which rows of the right :class:`Dataset` to link in reference to the timepoint anchor: ``all`` keep all rows ``closest`` get only the closest row that is within ``days`` days of the right DataFrame timepoint anchor :param when: which rows of the right Dataset to link in temporal relation to the timepoint anchor ``earlier`` get only rows that are earlier than the timepoint anchor ``later`` get only rows that are lter (more recent) than the timepoint anchor ``earlier_or_later`` get rows that are earlier or later than the timepoint anchor :param days: the time range measured in days :param dropna: whether to exclude rows that did not find any match :param duplicates_indicator: if True, adds a boolean column to the output Dataset called "_mp_duplicates" (True if duplicate, false if not). The column can be given a different name by providing a string argument. :param merge_suffixes: A length-2 sequence where the first element is suffix to add to the left Dataset columns, and second element is suffix to add to the right Dataset columns. """ if prepend_level_name: prepend_levels = (left.name, right.name) else: prepend_levels = (None, None) result_df = mp.pandas.operators.date_proximity.date_proximity( left, right, id_left_on=left.id2_col_name, id_right_on=right.id2_col_name, date_left_on=left.date_col_name, date_right_on=right.date_col_name, get=get, when=when, days=days, left_link_id=left.id_col_name, dropna=dropna, drop_duplicates=drop_duplicates, duplicates_indicator=duplicates_indicator, merge="partial", merge_suffixes=merge_suffixes, prepend_levels=prepend_levels, ) if prepend_level_name: new_id_col_name = (right.name, right.id_col_name) new_date_col_name = (right.name, right.date_col_name) new_id2_col_name = (right.name, right.id2_col_name) else: new_id_col_name = right.id_col_name new_date_col_name = right.date_col_name new_id2_col_name = right.id2_col_name return mp.Dataset( result_df, id_col_name=new_id_col_name, date_col_name=new_date_col_name, id2_col_name=new_id2_col_name, name=right.name, )