Example #1
def test_small_no_merge(cli_link_small_no_merge):
    expected_dict = pd.read_excel(
        current_dir / "small.xlsx",
        sheet_name=["LINK_INSTR1", "INSTR2_linked", "INSTR3_linked"])

    expected_primary = expected_dict["LINK_INSTR1"]
    expected_secondary_instr2 = expected_dict["INSTR2_linked"]
    expected_secondary_instr3 = expected_dict["INSTR3_linked"]

    # copy the output file to the current dir for easier debugging
    if output_dir is not None:
        copy(cli_link_small_no_merge, current_dir)

    results_dict = pd.read_excel(
        cli_link_small_no_merge,
        sheet_name=["small_anchor", "instr2_all_linked", "instr3_all_linked"],
    )

    result_primary = results_dict["small_anchor"]
    result_secondary_instr2 = results_dict["instr2_all_linked"]
    result_secondary_instr3 = results_dict["instr3_all_linked"]

    assert_dfs_equal(
        result_primary,
        expected_primary,
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )

    cols_ignore2 = [
        get_option("column.system.abs_diff_days"),
        get_option("column.system.diff_days"),
        "PIDN",
        "VType",
        "_merge",
    ]

    assert_dfs_equal(
        result_secondary_instr2,
        expected_secondary_instr2,
        cols_ignore=cols_ignore2,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )

    assert_dfs_equal(
        result_secondary_instr3,
        expected_secondary_instr3,
        cols_ignore=cols_ignore2,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
Example #2
    def __init__(
        self,
        data=None,
        *args,
        id_col_name=None,
        date_col_name=None,
        id2_col_name=None,
        name=None,
        tags=None,
        display_name_generator=None,
        id_col_dropna=False,
        **kwargs,
    ):
        super().__init__(data, *args, **kwargs)

        self.id_col_name = id_col_name
        self.date_col_name = date_col_name
        self.id2_col_name = id2_col_name
        self.name = name if name else get_option("dataset.default.name")
        self.tags = tags

        self.display_name_generator = (display_name_generator
                                       if display_name_generator else
                                       self.default_display_name_generator)

        if id_col_dropna:
            self.dropna(subset=[self.id_col_name], inplace=True)
Example #3
def add_diff_days(df: pd.DataFrame,
                  col_start: str,
                  col_end: str,
                  diff_days_col: str = None,
                  inplace=False):
    """Adds a column to DataFrame called ``_diff_days`` which contains
    the number of days between ``col_start`` and ``col_end``

    :param df: DataFrame
    :param col_start: column containing the start date
    :param col_end: column containing the end date
    """
    if diff_days_col is None:
        diff_days_col = get_option("column.system.diff_days")

    if col_start == col_end:
        raise KeyError(
            f"date columns have the same name: {col_start}=={col_end}")

    if not inplace:
        df = df.copy()

    df[diff_days_col] = df[col_end] - df[col_start]
    df[diff_days_col] = df[diff_days_col] / np.timedelta64(1, "D")

    # df.assign(**{diff_days_col: (df[col_end] - df[col_start]) / np.timedelta64(1, "D")})
    if not inplace:
        return df
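A minimal usage sketch of ``add_diff_days`` above (the data and the ``diff_days_col`` name are invented for illustration; in macpie the function is also reachable as ``df.mac.add_diff_days``, per Example #9):

import pandas as pd

df = pd.DataFrame({
    "start": pd.to_datetime(["2001-01-01", "2001-06-01"]),
    "end": pd.to_datetime(["2001-01-31", "2001-06-11"]),
})

# returns a modified copy by default; pass inplace=True to mutate df
result = add_diff_days(df, "start", "end", diff_days_col="days_between")
print(result["days_between"].tolist())  # [30.0, 10.0]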
Example #4
    def date_proximity(
        self,
        right_dset: "Dataset",
        get: str = "all",
        when: str = "earlier_or_later",
        days: int = 90,
        dropna: bool = False,
        drop_duplicates: bool = False,
        duplicates_indicator: bool = False,
        merge_suffixes=get_option("operators.binary.column_suffixes"),
        prepend_level_name: bool = True,
    ) -> "Dataset":
        """
        Links data across this :class:`Dataset` and ``right_dset``,
        updating :attr:`df` with the results.

        Calls :func:`macpie.date_proximity`, passing in this Dataset as
        the "left" Dataset, and ``right_dset`` as the "right" Dataset.
        """
        from macpie.operators.date_proximity import date_proximity

        return date_proximity(
            left=self,
            right=right_dset,
            get=get,
            when=when,
            days=days,
            dropna=dropna,
            drop_duplicates=drop_duplicates,
            duplicates_indicator=duplicates_indicator,
            merge_suffixes=merge_suffixes,
            prepend_level_name=prepend_level_name,
        )
Example #5
    def __init__(
            self,
            left: pd.DataFrame,
            right: pd.DataFrame,
            on=None,
            left_on=None,
            right_on=None,
            merge_suffixes=get_option("operators.binary.column_suffixes"),
            add_suffixes=False,
            add_indexes=(None, None),
    ):

        self.left = self.orig_left = left
        self.right = self.orig_right = right

        self.on = lltools.maybe_make_list(on)
        self.left_on = lltools.maybe_make_list(left_on)
        self.right_on = lltools.maybe_make_list(right_on)

        self.merge_suffixes = merge_suffixes

        self.add_suffixes = validatortools.validate_bool_kwarg(
            add_suffixes, "add_suffixes")

        self.add_indexes = add_indexes

        self._validate_specification()
Example #6
def mark_duplicates_by_cols(df: pd.DataFrame, cols: List[str]):
    """Create a column in ``df`` called ``_duplicates`` which is a boolean Series
    denoting duplicate rows as identified by ``cols``.

    :param df: DataFrame
    :param cols: Only consider these columns for identifying duplicates
    """
    df[get_option("column.system.duplicates")] = df.duplicated(subset=cols,
                                                               keep=False)
    return df
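A short, hypothetical usage of ``mark_duplicates_by_cols``; because ``keep=False`` is passed to :meth:`pandas.DataFrame.duplicated`, every member of a duplicate group is flagged, not just the repeats (the import mirrors the ones used in Example #19):

import pandas as pd

from macpie._config import get_option

df = pd.DataFrame({"pidn": [1, 1, 2],
                   "dcdate": ["2001-01-01", "2001-01-01", "2001-02-01"]})

df = mark_duplicates_by_cols(df, ["pidn", "dcdate"])
# rows 0 and 1 share pidn/dcdate, so both are flagged
print(df[get_option("column.system.duplicates")].tolist())  # [True, True, False]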
Example #7
    # This override re-declares the inherited ``primary`` property's setter;
    # the decorator is inferred from the ``fset`` call below.
    @AnchoredList.primary.setter
    def primary(self, dset: Dataset):
        """Sets the `primary` Dataset of this collection."""
        AnchoredList.primary.fset(self, dset)
        if self._primary is not None:
            if self._primary_anchor_col is None:
                self._primary_anchor_col = self._primary.id_col_name
            if self._secondary_anchor_col is None:
                self._secondary_anchor_col = (
                    self._primary.id_col_name +
                    get_option("operators.binary.column_suffixes")[0])
Example #8
    def write_tablib_dataset(self, tlset: tl.Dataset, freeze_panes=True):
        sheet_name = (safe_xlsx_sheet_title(tlset.title, "-") if tlset.title
                      else (get_option("excel.sheet_name.default")))

        ws = self.book.add_worksheet(sheet_name, autofit_columns=True)

        xlsxwritertools.tlset_sheet(tlset,
                                    ws,
                                    self.format_bold,
                                    self.format_text_wrap,
                                    freeze_panes=freeze_panes)
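A hedged usage sketch for ``write_tablib_dataset``, assuming ``writer`` is an instance of the xlsxwriter-backed MACPieExcelWriter this method belongs to:

import tablib as tl

tlset = tl.Dataset(("a", 1), ("b", 2),
                   headers=("letter", "number"),
                   title="instr1")

# writes the tablib Dataset to a worksheet named after its title
writer.write_tablib_dataset(tlset)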
Example #9
def test_add_diff_days():
    d = {
        "col1": ["a", "b", "c"],
        "col2":
        [datetime(2001, 3, 2),
         datetime(2001, 2, 1),
         datetime(2001, 8, 1)],
        "col3":
        [datetime(2002, 3, 2),
         datetime(2001, 3, 1),
         datetime(2001, 8, 2)],
    }
    df = pd.DataFrame(data=d)

    with pytest.raises(KeyError):
        df.mac.add_diff_days("col2", "col2")

    df = df.mac.add_diff_days("col2", "col3")

    assert get_option("column.system.diff_days") in df.columns
    assert df[get_option("column.system.diff_days")].equals(
        pd.Series([365.0, 28.0, 1.0]))
Example #10
    def execute(self):
        prim_dset = Dataset.from_file(
            self.primary,
            id_col_name=None,
            date_col_name=self.date_col,
            id2_col_name=self.id2_col,
            name=self.primary.stem,
        )

        if self.id_col in prim_dset.columns:
            prim_dset.id_col_name = self.id_col
        else:
            click.echo(
                "\nWARNING: ID Column Header (-i, --id-col) not specified "
                'and default of "InstrID" not found in your PRIMARY file.')
            click.echo(
                f'         Creating one for you called "{NEW_ID_COL_NAME}"\n')

            prim_dset.create_id_col(col_name=NEW_ID_COL_NAME)

        collection = MergeableAnchoredList(prim_dset)

        if self.secondary:
            for sec in self.secondary:
                try:
                    sec_dset = Dataset.from_file(
                        sec,
                        id_col_name=self.secondary_id_col,
                        date_col_name=self.secondary_date_col,
                        id2_col_name=self.secondary_id2_col,
                        name=sec.stem,
                    )

                    sec_dset_linked = prim_dset.date_proximity(
                        right_dset=sec_dset,
                        get=self.secondary_get,
                        when=self.secondary_when,
                        days=self.secondary_days,
                        merge_suffixes=get_option(
                            "operators.binary.column_suffixes"),
                        prepend_level_name=False,
                    )

                    collection.add_secondary(sec_dset_linked)
                except Exception as e:
                    click.echo(f'\nERROR loading secondary dataset "{sec}"\n')
                    click.echo(e)
                    raise e

        self.collection = collection
Example #11
    def __new__(cls, *args, **kwargs):
        # only switch class if generic(MACPieExcelWriter)
        if cls is MACPieExcelWriter:
            engine = kwargs.pop("engine", None)
            if engine is None:
                engine = get_option("excel.writer.engine")
            if engine not in ("mp_xlsxwriter", "mp_openpyxl"):
                raise ValueError(
                    f"Invalid engine: '{engine}'. Only 'mp_xlsxwriter' or 'mp_openpyxl' supported."
                )

            cls = pd.io.excel._util.get_writer(engine)

        return object.__new__(cls)
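The ``__new__`` override above uses the same engine-dispatch idiom pandas applies to ``ExcelWriter``: instantiating the generic class swaps in a concrete engine class before construction proceeds. A stripped-down, hypothetical illustration of the pattern:

class BaseWriter:
    _engines = {}  # maps engine name -> concrete subclass

    def __new__(cls, *args, **kwargs):
        # only dispatch when the generic base class is instantiated;
        # concrete subclasses construct normally
        if cls is BaseWriter:
            engine = kwargs.get("engine") or "default"
            cls = BaseWriter._engines[engine]
        return object.__new__(cls)

class CsvWriter(BaseWriter):
    pass

BaseWriter._engines["default"] = CsvWriter

assert type(BaseWriter()) is CsvWriter  # dispatched to the concrete class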
Example #12
    def sys_cols(self):
        """Returns list of `system` column names of this :class:`Dataset`,
        defined as any columns starting with ``column.system.prefix`` option.
        """
        sys_col_prefix = get_option("column.system.prefix")

        if isinstance(self.columns, pd.MultiIndex):
            return [
                col for col in self.columns
                if col[-1].startswith(sys_col_prefix)
            ]
        else:
            return [
                col for col in self.columns if col.startswith(sys_col_prefix)
            ]
Example #13
def test_keep_earliest_csv():
    # test earliest
    df = file_to_dataframe(data_dir / "instr1_primaryall.csv")

    result = df.mac.group_by_keep_one(group_by_col="pidn",
                                      date_col_name="dcdate",
                                      keep="earliest")

    assert get_option("column.system.duplicates") in result.columns

    expected_result = file_to_dataframe(data_dir /
                                        "instr1_primaryearliest.csv")

    assert_dfs_equal(result,
                     expected_result,
                     cols_ignore=cols_ignore,
                     output_dir=output_dir)
Example #14
def merge(
        left,
        right,
        on=None,
        left_on=None,
        right_on=None,
        merge_suffixes=get_option("operators.binary.column_suffixes"),
        add_suffixes=False,
        add_indexes=(None, None),
) -> pd.DataFrame:
    """Merge :class:`pandas.DataFrame` objects with a database-style join, similar to
    :meth:`pandas.DataFrame.merge`.

    :param left: DataFrame
    :param right: the DataFrame to merge with
    :param on: column(s) to join on. These must be found in both
               DataFrames.
    :param left_on: column(s) to join on in the left DataFrame
    :param right_on: column(s) to join on in the right DataFrame
    :param merge_suffixes: A length-2 sequence where the first element is the
                           suffix to add to the left DataFrame columns, and the
                           second element is the suffix to add to the right
                           DataFrame columns. Only added if ``add_suffixes`` is ``True``.
    :param add_suffixes: Whether to add the suffixes specified in ``merge_suffixes`` or not
    :param add_indexes: A length-2 sequence where each element is optionally a string
                        indicating a top-level index to add to column indexes in ``left``
                        and ``right`` respectively (thus creating a :class:`pandas.MultiIndex`
                        if needed). Pass a value of ``None`` instead of a string
                        to indicate that the column index in ``left`` or ``right`` should be
                        left as-is. At least one of the values must not be ``None``.
    """

    op = _MergeOperation(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        merge_suffixes=merge_suffixes,
        add_suffixes=add_suffixes,
        add_indexes=add_indexes,
    )
    return op.get_result()
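A hedged usage sketch of ``merge`` (data and names invented for illustration; macpie also exposes it as ``df.mac.merge``, per Example #15). ``add_indexes`` is the main departure from :meth:`pandas.DataFrame.merge`:

import pandas as pd

left = pd.DataFrame({"pidn": [1, 2], "score": [10, 20]})
right = pd.DataFrame({"pidn": [1, 2], "dcdate": ["2001-01-01", "2001-02-01"]})

# add a top-level "instr1" index over the right DataFrame's columns,
# leaving the left DataFrame's column index as-is
result = merge(left, right, on="pidn", add_indexes=(None, "instr1"))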
Example #15
    def merge(
        self,
        right: pd.DataFrame,
        on=None,
        left_on=None,
        right_on=None,
        merge_suffixes=get_option("operators.binary.column_suffixes"),
        add_suffixes=False,
        add_indexes=(None, None),
    ):
        """see :meth:`macpie.pandas.merge`"""
        return operators.merge.merge(
            self._df,
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            merge_suffixes=merge_suffixes,
            add_suffixes=add_suffixes,
            add_indexes=add_indexes,
        )
Example #16
    def date_proximity(
        self,
        right: pd.DataFrame,
        id_on=None,
        id_left_on=None,
        id_right_on=None,
        date_on=None,
        date_left_on=None,
        date_right_on=None,
        get: str = "all",
        when: str = "earlier_or_later",
        days: int = 90,
        left_link_id=None,
        dropna: bool = False,
        drop_duplicates: bool = False,
        duplicates_indicator: bool = False,
        merge="partial",
        merge_suffixes=get_option("operators.binary.column_suffixes"),
    ):
        """see :meth:`macpie.pandas.date_proximity`"""
        return operators.date_proximity.date_proximity(
            self._df,
            right,
            id_on=id_on,
            id_left_on=id_left_on,
            id_right_on=id_right_on,
            date_on=date_on,
            date_left_on=date_left_on,
            date_right_on=date_right_on,
            get=get,
            when=when,
            days=days,
            left_link_id=left_link_id,
            dropna=dropna,
            drop_duplicates=drop_duplicates,
            duplicates_indicator=duplicates_indicator,
            merge=merge,
            merge_suffixes=merge_suffixes,
        )
Example #17
    def add_secondary(self, dset: Dataset):
        """Append `dset` to :attr:`MergeableAnchoredList.secondary`."""
        if self._secondary_anchor_col not in dset.columns:
            warnings.warn(
                f"Warning: Secondary dataset '{dset!r}' does not have "
                f"anchor column '{self._secondary_anchor_col}'. Skipping...")
            return

        dups = dset.duplicated(subset=[self._secondary_anchor_col], keep=False)
        if dups.any():
            dups_col = get_option("column.system.duplicates")
            if dups_col in dset.sys_cols:
                dset.rename_col(dups_col, dups_col + "_prior", inplace=True)
            dset.mac.insert(dups_col, dups)
            dset.add_tag(Dataset.tag_duplicates)
        else:
            dset.add_tag(MergeableAnchoredList.tag_mergeable)

        super().add_secondary(dset)

        dset.display_name_generator = MergeableAnchoredList.dataset_display_name_generator

        self._merged_dset = None
Example #18
def diff_rows(left: pd.DataFrame,
              right: pd.DataFrame,
              cols_ignore=set(),
              cols_ignore_pat=None):
    """If ``left`` and ``right`` share the same columns, returns a DataFrame
    containing rows that differ.

    :param left: left DataFrame
    :param right: right DataFrame
    :param cols_ignore: a list of any columns to ignore
    :param cols_ignore_pat: a regular expression; columns whose names match
                            it are also ignored
    """
    left = drop_cols(left, cols_list=cols_ignore, cols_pat=cols_ignore_pat)
    right = drop_cols(right, cols_list=cols_ignore, cols_pat=cols_ignore_pat)

    left_only_cols, right_only_cols = diff_cols(left, right)

    if left_only_cols == right_only_cols == set():
        indicator_col_name = get_option(
            "column.system.prefix") + "_diff_rows_merge"
        if isinstance(left.columns, pd.MultiIndex) or isinstance(
                right.columns, pd.MultiIndex):
            # TODO: Doing a pd.merge() on MultiIndex dataframes with indicator
            # set to True/string resulted in the following error:
            # pandas.errors.PerformanceWarning: dropping on a non-lexsorted multi-index
            # without a level parameter may impact performance
            # Flatten the column MultiIndexes to get around this
            left.columns = left.columns.to_flat_index()
            right.columns = right.columns.to_flat_index()
        merged_df = pd.merge(left,
                             right,
                             indicator=indicator_col_name,
                             how="outer")
        changed_rows_df = merged_df[merged_df[indicator_col_name] != "both"]
        return changed_rows_df

    raise KeyError("Dataframes do not share the same columns")
Example #19
from pathlib import Path

from macpie._config import get_option
from macpie.pandas import file_to_dataframe
from macpie.testing import assert_dfs_equal

current_dir = Path("tests/pandas/operators/date_proximity/dupes/").resolve()

# output_dir = current_dir
output_dir = None

cols_ignore = [
    get_option("column.system.abs_diff_days"),
    get_option("column.system.diff_days"),
    get_option("column.system.duplicates"),
    '_abs_diff_days', '_diff_days', '_duplicates'
]


def test_dupes():

    primary = file_to_dataframe(current_dir / "primary.xlsx")
    secondary = file_to_dataframe(current_dir / "secondary.xlsx")

    dupes_result = primary.mac.date_proximity(
        secondary,
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
    )
Example #20
    def _get_sheet_name(self, sheet_name):
        if not sheet_name:
            sheet_name = get_option("excel.sheet_name.default")
        return super()._get_sheet_name(sheet_name)
Example #21
def group_by_keep_one(
    df: pd.DataFrame,
    group_by_col: str,
    date_col_name: str,
    keep: str = "all",
    id_col_name: str = None,
    drop_duplicates: bool = False,
) -> pd.DataFrame:
    """
    Given a :class:`pandas.DataFrame` object, group on the ``group_by_col`` column
    and keep only the earliest or latest row in each group as determined by the date
    in the ``date_col_name`` column.

    :param df: the DataFrame to operate on
    :param group_by_col: the DataFrame column to group on
    :param date_col_name: the date column to determine which row is earliest or latest
    :param keep: specify which row of each group to keep

        ``all``
             keep all rows

        ``earliest``
             in each group, keep only the earliest (i.e. oldest) row

        ``latest``
             in each group, keep only the latest (i.e. most recent) row

    :param id_col_name: if ``drop_duplicates=True``, the column specified
                   here will also be used for identifying duplicates
    :param drop_duplicates: if ``True``, then if more than one row is determined to be
                            earliest or latest in each group, drop all duplicates
                            except the first occurrence. If ``id_col_name`` is specified,
                            then that column will also be used for identifying duplicates
    """

    # groupby.first() and groupby.last() can't handle NaN values (ongoing bug)
    # use groupby.nth(0) and groupby.nth(-1) instead

    group_by_col = df.mac.get_col_name(group_by_col)

    date_col_name = df.mac.to_datetime(date_col_name)

    if keep not in ["all", "earliest", "latest"]:
        raise ValueError("invalid keep option")

    id_col_name = df.mac.get_col_name(
        id_col_name) if id_col_name is not None else None

    drop_duplicates = validatortools.validate_bool_kwarg(
        drop_duplicates, "drop_duplicates")

    cols = [group_by_col, date_col_name]

    # first drop rows where group_by col or date col is na and re-index
    df = df.dropna(subset=cols).reset_index(drop=True)

    df = df.sort_values(by=(cols + [id_col_name]) if id_col_name is not None else cols,
                        na_position="last")

    if keep in {"earliest", "latest"}:
        pre_results = None

        if keep == "earliest":
            pre_results = df.groupby(group_by_col, sort=False,
                                     as_index=False).nth(0)
        else:
            # keep == 'latest'
            pre_results = df.groupby(group_by_col, sort=False,
                                     as_index=False).nth(-1)

        # in case there are duplicates, keep them

        pre_results = pre_results.filter(items=cols)
        df = pd.merge(df, pre_results, how="inner", on=cols, indicator=False)

    dup_cols = (cols + [id_col_name]) if id_col_name is not None else cols
    dups = df.duplicated(subset=dup_cols, keep=False)
    if dups.any():
        if drop_duplicates:
            df = df.drop_duplicates(subset=dup_cols,
                                    keep="first",
                                    ignore_index=True)
        else:
            df.mac.insert(get_option("column.system.duplicates"), dups)

    return df
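A usage sketch for ``group_by_keep_one`` (illustrative data; it assumes that importing macpie registers the ``mac`` DataFrame accessor the function relies on, as the tests in this collection imply):

import pandas as pd
import macpie  # noqa: F401 -- assumed to register the ``mac`` accessor

df = pd.DataFrame({
    "pidn": [1, 1, 2],
    "dcdate": pd.to_datetime(["2001-06-01", "2001-01-01", "2001-03-01"]),
})

# keep only the earliest row per pidn: (1, 2001-01-01) and (2, 2001-03-01)
result = group_by_keep_one(df, group_by_col="pidn",
                           date_col_name="dcdate", keep="earliest")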
Example #22
    "--secondary-get",
    default="all",
    type=click.Choice(["all", "closest"], case_sensitive=False),
)
@click.option("-t", "--secondary-days", default=90)
@click.option(
    "-w",
    "--secondary-when",
    default="earlier_or_later",
    type=click.Choice(["earlier", "later", "earlier_or_later"],
                      case_sensitive=False),
)
@click.option(
    "-i",
    "--secondary-id-col",
    default=get_option("dataset.id_col_name"),
    help="Secondary ID Column Header",
)
@click.option(
    "-d",
    "--secondary-date-col",
    default=get_option("dataset.date_col_name"),
    help="Secondary Date Column Header",
)
@click.option(
    "-j",
    "--secondary-id2-col",
    default=get_option("dataset.id2_col_name"),
    help="Secondary ID2 Column Header",
)
@click.option("--merge-results/--no-merge-results", default=True)
Example #23
def test_any_duplicates():
    d = {
        "col1": ["a", "b", "c"],
        "col2":
        [datetime(2001, 3, 2),
         datetime(2001, 2, 1),
         datetime(2001, 8, 1)],
        "col3":
        [datetime(2002, 3, 2),
         datetime(2001, 3, 1),
         datetime(2001, 8, 2)],
    }
    df = pd.DataFrame(data=d)

    # no col4 column raises KeyError
    with pytest.raises(KeyError):
        df.mac.any_duplicates("col4")

    d = {
        get_option("column.system.duplicates"): [False, False, False],
        "col2":
        [datetime(2001, 3, 2),
         datetime(2001, 2, 1),
         datetime(2001, 8, 1)],
        "col3":
        [datetime(2002, 3, 2),
         datetime(2001, 3, 1),
         datetime(2001, 8, 2)],
    }
    df = pd.DataFrame(data=d)

    assert not df[get_option("column.system.duplicates")].any()

    d = {
        get_option("column.system.duplicates"): [True, False, False],
        "col2":
        [datetime(2001, 3, 2),
         datetime(2001, 2, 1),
         datetime(2001, 8, 1)],
        "col3":
        [datetime(2002, 3, 2),
         datetime(2001, 3, 1),
         datetime(2001, 8, 2)],
    }
    df = pd.DataFrame(data=d)

    assert df.mac.any_duplicates(get_option("column.system.duplicates"))

    d = {
        "col1": ["a", "b", "c"],
        "col2": [None, None, 6],
        "col3": [np.nan, np.nan, 8]
    }
    df = pd.DataFrame(data=d)

    # nulls by default count as duplicates
    assert df.mac.any_duplicates("col2")
    assert df.mac.any_duplicates("col3")

    # don't count nulls as duplicates
    assert not df.mac.any_duplicates("col2", ignore_nan=True)
    assert not df.mac.any_duplicates("col3", ignore_nan=True)
Example #24
    def to_excel(
        self,
        excel_writer,
        sheet_name=None,
        na_rep="",
        float_format=None,
        columns=None,
        header=True,
        index=False,
        index_label=None,
        startrow=0,
        startcol=0,
        engine=None,
        merge_cells=True,
        encoding=None,
        inf_rep="inf",
        verbose=True,
        freeze_panes=None,
        storage_options=None,
        write_excel_dict=True,
        highlight_duplicates=True,
    ) -> None:
        """Write :class:`Dataset` to an Excel sheet.

        :param excel_writer: File path or existing ExcelWriter.
        """
        from macpie.io.formats.excel import MACPieExcelFormatter

        if isinstance(excel_writer, MACPieExcelWriter):
            need_save = False
        else:
            excel_writer = MACPieExcelWriter(
                excel_writer,
                engine=engine,
                storage_options=storage_options,
            )
            need_save = True

        try:
            if isinstance(self, pd.core.dtypes.generic.ABCDataFrame):
                dset = self
            else:
                dset = self.to_frame()

            formatter = MACPieExcelFormatter(
                dset,
                na_rep=na_rep,
                cols=columns,
                header=header,
                float_format=float_format,
                index=index,
                index_label=index_label,
                merge_cells=merge_cells,
                inf_rep=inf_rep,
            )

            if highlight_duplicates:
                dups_col_name = get_option("column.system.duplicates")
                if dups_col_name in self.columns:
                    formatter.highlight_row_by_column_predicate(
                        column=dups_col_name, predicate=bool)

            if sheet_name is None:
                sheet_name = self.excel_sheetname

            formatter.write(
                excel_writer,
                sheet_name=sheet_name,
                startrow=startrow,
                startcol=startcol,
                freeze_panes=freeze_panes,
            )

            if write_excel_dict:
                to_excel_kwargs = {
                    "sheet_name": sheet_name,
                    "na_rep": na_rep,
                    "float_format": float_format,
                    "columns": columns,
                    "header": header,
                    "index": index,
                    "index_label": index_label,
                    "startrow": startrow,
                    "startcol": startcol,
                    "engine": engine,
                    "merge_cells": merge_cells,
                    "encoding": encoding,
                    "inf_rep": inf_rep,
                    "verbose": verbose,
                    "freeze_panes": freeze_panes,
                    "storage_options": storage_options,
                    "write_excel_dict": write_excel_dict,
                    "highlight_duplicates": highlight_duplicates,
                }

                if not header:
                    header_col = None
                elif self.columns.nlevels > 1 and merge_cells:
                    # if merge_cells is False, MultiIndex header will be in
                    # legacy format, which is one row with dots to indicate levels.
                    header_col = list(range(0, self.columns.nlevels))
                else:
                    header_col = 0

                if not index:
                    index_col = None
                elif self.index.nlevels > 1:
                    index_col = list(range(0, self.index.nlevels))
                else:
                    index_col = 0

                # ensure successful write/read round-trip
                read_excel_kwargs = {
                    "header": header_col,
                    "index_col": index_col
                }

                excel_dict = self.to_excel_dict()
                excel_dict["to_excel_kwargs"] = to_excel_kwargs
                excel_dict["read_excel_kwargs"] = read_excel_kwargs
                excel_writer.write_excel_dict(excel_dict)
        finally:
            # make sure to close opened file handles
            if need_save:
                excel_writer.close()
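A hedged usage sketch of ``to_excel``: passing a path instead of an existing writer makes the method create its own MACPieExcelWriter and close it when done (the Dataset and path below are illustrative, using the constructor from Example #2):

import pandas as pd
import macpie as mp

dset = mp.Dataset(
    pd.DataFrame({"InstrID": [1, 2], "PIDN": [10, 11]}),
    id_col_name="InstrID", id2_col_name="PIDN", name="instr1")

# sheet name defaults to the Dataset's own excel_sheetname
dset.to_excel("instr1.xlsx")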
Example #25
from pathlib import Path

import pytest

from macpie._config import get_option
from macpie.pandas import file_to_dataframe
from macpie.testing import assert_dfs_equal

data_dir = Path("tests/data/").resolve()
current_dir = Path("tests/pandas/operators/group_by_keep_one/").resolve()

# output_dir = current_dir
output_dir = None

cols_ignore = ["link_date", "link_id", get_option("column.system.duplicates")]


def test_keep_earliest_csv():
    # test earliest
    df = file_to_dataframe(data_dir / "instr1_primaryall.csv")

    result = df.mac.group_by_keep_one(group_by_col="pidn",
                                      date_col_name="dcdate",
                                      keep="earliest")

    assert get_option("column.system.duplicates") in result.columns

    expected_result = file_to_dataframe(data_dir /
                                        "instr1_primaryearliest.csv")

    assert_dfs_equal(result,
                     expected_result,
                     cols_ignore=cols_ignore,
                     output_dir=output_dir)
Example #26
    def __init__(
        self,
        left: pd.DataFrame,
        right: pd.DataFrame,
        id_on=None,
        id_left_on=None,
        id_right_on=None,
        date_on=None,
        date_left_on=None,
        date_right_on=None,
        get: str = "all",
        when: str = "earlier_or_later",
        days: int = 90,
        left_link_id=None,
        dropna: bool = False,
        drop_duplicates: bool = False,
        duplicates_indicator: bool = False,
        merge="partial",
        merge_suffixes=get_option("operators.binary.column_suffixes"),
        prepend_levels=(None, None),
    ):
        self.left = left
        self.right = right

        self.id_on = lltools.maybe_make_list(id_on)
        self.id_left_on = lltools.maybe_make_list(id_left_on)
        self.id_right_on = lltools.maybe_make_list(id_right_on)

        self.date_on = date_on
        self.date_left_on = date_left_on
        self.date_right_on = date_right_on

        self.get = get
        self.when = when
        self.days = days

        self.left_link_id = left_link_id

        self.dropna = validatortools.validate_bool_kwarg(dropna, "dropna")
        self.drop_duplicates = validatortools.validate_bool_kwarg(
            drop_duplicates, "drop_duplicates"
        )

        self.duplicates_indicator = duplicates_indicator

        self.duplicates_indicator_name: Optional[str]
        if isinstance(self.duplicates_indicator, str):
            self.duplicates_indicator_name = self.duplicates_indicator
        elif isinstance(self.duplicates_indicator, bool):
            self.duplicates_indicator_name = (
                get_option("column.system.duplicates") if self.duplicates_indicator else None
            )
        else:
            raise ValueError("indicator option can only accept boolean or string arguments")

        self.merge = merge
        self.merge_suffixes = merge_suffixes
        self.prepend_levels = prepend_levels

        self._left_suffix = get_option("operators.binary.column_suffixes")[0]
        self._right_suffix = get_option("operators.binary.column_suffixes")[1]
        self._diff_days_col = get_option("column.system.diff_days")
        self._abs_diff_days_col = get_option("column.system.abs_diff_days")
        self._merge_indicator_col = get_option("column.system.merge")

        self._validate_specification()
Example #27
def date_proximity(
    left,
    right,
    id_on=None,
    id_left_on=None,
    id_right_on=None,
    date_on=None,
    date_left_on=None,
    date_right_on=None,
    get: str = "all",
    when: str = "earlier_or_later",
    days: int = 90,
    left_link_id=None,
    dropna: bool = False,
    drop_duplicates: bool = False,
    duplicates_indicator: bool = False,
    merge="partial",
    merge_suffixes=get_option("operators.binary.column_suffixes"),
    prepend_levels=(None, None),
) -> pd.DataFrame:
    """Links data across two :class:`pandas.DataFrame` objects by date proximity.

    Specifically, a "left" DataFrame contains a timepoint anchor, and a "right" DataFrame
    is linked to the left by retrieving all rows that match on a specified id col, and
    whose specified date fields are within a certain time range of each other.

    :param left: the DataFrame containing the timepoint anchor
    :param right: the DataFrame to link
    :param id_on: primary column(s) to join on. These must be found in both
                  DataFrames.
    :param id_left_on: primary column to join on in the left DataFrame
    :param id_right_on: primary column to join on in the right DataFrame
    :param date_on: date columns to use for timepoint matching. These must
                    be found in both DataFrames, and the one on the left
                    will act as timepoint anchor.
    :param date_left_on: date column in left DataFrame to act as timepoint anchor.
    :param date_right_on: date column in the right DataFrame to compare with left's
                          timepoint anchor
    :param get: which rows of the right DataFrame to link in reference to the
                timepoint anchor:

        ``all``
             keep all rows

        ``closest``
             get only the closest row that is within ``days`` days of the
             right DataFrame timepoint anchor

    :param when: which rows of the right DataFrame to link in temporal relation
                 to the timepoint anchor

        ``earlier``
             get only rows that are earlier than the timepoint anchor

        ``later``
             get only rows that are later (more recent) than the timepoint anchor

        ``earlier_or_later``
             get rows that are earlier or later than the timepoint anchor

    :param days: the time range measured in days
    :param left_link_id: the id column in the left DataFrame to act as the
                         primary key of that data. This helps to ensure there
                         are no duplicates in the left DataFrame (i.e. rows with
                         the same ``id_left_on`` and ``date_left_on``)
    :param dropna: whether to exclude rows that did not find any match
    :param merge: which columns to include in result

        ``partial``
             include only columns from the right DataFrame

        ``full``
             include all columns from both left and right DataFrames

    :param merge_suffixes: A length-2 sequence where the first element is the
                           suffix to add to the left DataFrame columns, and the
                           second element is the suffix to add to the right DataFrame columns.
    :param prepend_levels: A length-2 sequence where each element is optionally a string
                           indicating a top-level index to add to column indexes in ``left``
                           and ``right`` respectively (thus creating a :class:`pandas.MultiIndex`
                           if needed). Pass a value of ``None`` instead of a string
                           to indicate that the column index in ``left`` or ``right`` should be
                           left as-is. At least one of the values must not be ``None``.

    """
    op = _DateProximityOperation(
        left,
        right,
        id_on=id_on,
        id_left_on=id_left_on,
        id_right_on=id_right_on,
        date_on=date_on,
        date_left_on=date_left_on,
        date_right_on=date_right_on,
        get=get,
        when=when,
        days=days,
        left_link_id=left_link_id,
        dropna=dropna,
        drop_duplicates=drop_duplicates,
        duplicates_indicator=duplicates_indicator,
        merge=merge,
        merge_suffixes=merge_suffixes,
        prepend_levels=prepend_levels,
    )
    return op.get_result()
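An end-to-end sketch of the pandas-level ``date_proximity`` via the ``mac`` accessor shown in Example #16 (data and column names invented for illustration):

import pandas as pd
import macpie  # noqa: F401 -- assumed to register the ``mac`` accessor

left = pd.DataFrame({
    "pidn": [1, 2],
    "dcdate": pd.to_datetime(["2001-01-15", "2001-06-01"]),
})
right = pd.DataFrame({
    "pidn": [1, 1, 2],
    "dcdate": pd.to_datetime(["2001-01-10", "2001-05-01", "2001-06-20"]),
    "score": [7, 8, 9],
})

# for each left row, link the closest matching-pidn right row whose
# dcdate falls within 90 days (earlier or later):
#   left pidn=1 (2001-01-15) -> right 2001-01-10 (5 days earlier)
#   left pidn=2 (2001-06-01) -> right 2001-06-20 (19 days later)
result = left.mac.date_proximity(right, id_on="pidn",
                                 date_on="dcdate", get="closest")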
Example #28
from pathlib import Path

import pandas as pd
import pytest

from macpie._config import get_option
from macpie.pandas import file_to_dataframe
from macpie.testing import assert_dfs_equal


data_dir = Path("tests/data/").resolve()
current_dir = Path(__file__).parent.absolute()

# output_dir = current_dir
output_dir = None

cols_ignore = [
    'PIDN', 'VType', 'DayDiff'
]
cols_ignore_pat = '^' + get_option("column.system.prefix")

dfs_dict = pd.read_excel(
    data_dir / "instr2.xlsx",
    sheet_name=[
        'LINK_INSTR2',
        'INSTR1_linked',
        'INSTR3_linked'
    ]
)

primary = dfs_dict['LINK_INSTR2']


@pytest.mark.slow
def test_secondary_instr1():
Example #29
def date_proximity(
    left: mp.Dataset,
    right: mp.Dataset,
    get: str = "all",
    when: str = "earlier_or_later",
    days: int = 90,
    dropna: bool = False,
    drop_duplicates: bool = False,
    duplicates_indicator: bool = False,
    merge_suffixes=get_option("operators.binary.column_suffixes"),
    prepend_level_name: bool = True,
) -> mp.Dataset:
    """Links data across two :class:`Dataset` objects by date proximity, first joining
    them on their :attr:`Dataset.id2_col_name`.

    Specifically, a `left` Dataset contains a timepoint anchor, and a `right` Dataset
    is linked to the `left` by retrieving all rows that match on :attr:`Dataset.id2_col_name`, and
    whose :attr:`Dataset.date_col_name` fields are within a certain time range of each other.

    This is the :class:`Dataset` analog of :func:`macpie.pandas.date_proximity`.

    :param left: the :class:`Dataset` containing the timepoint anchor
    :param right: the :class:`Dataset` to link. Its :attr:`Dataset.df` attribute gets updated with
                  the results of this operation
    :param get: which rows of the right :class:`Dataset` to link in reference to the
                timepoint anchor:

        ``all``
             keep all rows

        ``closest``
             get only the closest row that is within ``days`` days of the
             right DataFrame timepoint anchor

    :param when: which rows of the right Dataset to link in temporal relation
                 to the timepoint anchor

        ``earlier``
             get only rows that are earlier than the timepoint anchor

        ``later``
             get only rows that are later (more recent) than the timepoint anchor

        ``earlier_or_later``
             get rows that are earlier or later than the timepoint anchor

    :param days: the time range measured in days
    :param dropna: whether to exclude rows that did not find any match
    :param duplicates_indicator: if ``True``, adds a boolean column to the output Dataset called
                                 "_mp_duplicates" (``True`` if duplicate, ``False`` if not). The column
                                 can be given a different name by providing a string argument.
    :param merge_suffixes: A length-2 sequence where the first element is the
                           suffix to add to the left Dataset columns, and the
                           second element is the suffix to add to the right Dataset columns.
    """

    if prepend_level_name:
        prepend_levels = (left.name, right.name)
    else:
        prepend_levels = (None, None)

    result_df = mp.pandas.operators.date_proximity.date_proximity(
        left,
        right,
        id_left_on=left.id2_col_name,
        id_right_on=right.id2_col_name,
        date_left_on=left.date_col_name,
        date_right_on=right.date_col_name,
        get=get,
        when=when,
        days=days,
        left_link_id=left.id_col_name,
        dropna=dropna,
        drop_duplicates=drop_duplicates,
        duplicates_indicator=duplicates_indicator,
        merge="partial",
        merge_suffixes=merge_suffixes,
        prepend_levels=prepend_levels,
    )

    if prepend_level_name:
        new_id_col_name = (right.name, right.id_col_name)
        new_date_col_name = (right.name, right.date_col_name)
        new_id2_col_name = (right.name, right.id2_col_name)
    else:
        new_id_col_name = right.id_col_name
        new_date_col_name = right.date_col_name
        new_id2_col_name = right.id2_col_name

    return mp.Dataset(
        result_df,
        id_col_name=new_id_col_name,
        date_col_name=new_date_col_name,
        id2_col_name=new_id2_col_name,
        name=right.name,
    )