def _infer_abst_var_names(end_visits, abst_var_names, prefix=''):
    if abst_var_names == 'infer':
        abst_names = [f"{prefix}_v{end_visit}" for end_visit in end_visits]
    else:
        abst_names = AbstinenceCalculator._listize_args(abst_var_names)
    if len(end_visits) != len(abst_names):
        raise InputArgumentError(
            "The number of abstinence variable names should match the number of visits."
        )
    return abst_names
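A quick standalone sketch of the naming pattern this helper produces when names are inferred (the prefix and visit numbers below are made up):

# Standalone sketch of the inferred naming convention (prefix and visits are made up).
end_visits = [2, 5]
prefix = 'itt_cont_v1'
abst_names = [f"{prefix}_v{end_visit}" for end_visit in end_visits]
print(abst_names)  # ['itt_cont_v1_v2', 'itt_cont_v1_v5']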
Example #2
    def interpolate_biochemical_data(self, half_life_in_days,
                                     maximum_days_to_interpolate):
        """
        Interpolate the biochemical data

        :param half_life_in_days: Union[float, int]
            The half-life of the biochemical measure, in days, used to interpolate values for the days preceding
            a non-abstinent record

        :param maximum_days_to_interpolate: int, The maximum number of days to interpolate

        :return: The number of interpolated records
        """
        if half_life_in_days <= 0 or maximum_days_to_interpolate < 1 or \
                not isinstance(maximum_days_to_interpolate, int):
            raise InputArgumentError(
                "The half-life of the biochemical should be greater than 0, and the maximum number of "
                "days to interpolate should be a positive integer.")

        if 'imputation_code' in self.data.columns:
            self.data = self.data[self.data['imputation_code'] == 0]

        self.data.sort_values(by=self._index_keys,
                              inplace=True,
                              ignore_index=True)
        self.data['imputation_code'] = DataImputationCode.RAW.value

        interpolated_records = []

        for subject_id in sorted(self.subject_ids):
            subject_data = self.data[self.data['id'] == subject_id]
            interpolated_dates = set()
            for row in subject_data.itertuples():
                current_amount = row.amount
                current_date = row.date
                for day in range(1, maximum_days_to_interpolate + 1):
                    interpolated_date = current_date + timedelta(days=-day)
                    if interpolated_date in interpolated_dates or current_amount <= self.abst_cutoff:
                        continue
                    interpolated_amount = pow(
                        2, day / half_life_in_days) * current_amount
                    interpolated_record = \
                        [subject_id, interpolated_date, interpolated_amount, DataImputationCode.IMPUTED.value]
                    interpolated_dates.add(interpolated_date)
                    interpolated_records.append(interpolated_record)

        interpolated_df = pd.DataFrame(
            interpolated_records,
            columns=[*self._index_keys, 'amount', 'imputation_code'])
        self.data = pd.concat([self.data, interpolated_df
                               ]).sort_values(by=self._index_keys,
                                              ignore_index=True)
        return len(interpolated_df)
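The back-extrapolation above scales each preceding day's value by 2 ** (day / half_life_in_days); a minimal standalone sketch of that arithmetic with made-up numbers:

# Standalone sketch of the exponential back-extrapolation (made-up values).
half_life_in_days = 1.5
current_amount = 40
for day in range(1, 3):
    interpolated_amount = pow(2, day / half_life_in_days) * current_amount
    print(day, round(interpolated_amount, 1))
# 1 63.5
# 2 100.8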
Example #3
    def check_duplicates(self, duplicate_kept="mean"):
        """
        Check duplicate records, which are identified using id and date for the TLFB data and using id and visit for
        the visit data

        :param duplicate_kept: Union["min", "max", "mean", False, None], specify the action with duplicate records
            "max": keep the records with the maximal value
            "min": keep the records with the minimal value
            "mean" (the default): drop duplicate records and replace them with their mean value
            False: remove all duplicate records
            None: take no action on duplicate records
        :return: a DataFrame of duplicate records (0 if no duplicates are found or no action is requested)
        """
        if duplicate_kept not in ("min", "max", "mean", False, None):
            raise InputArgumentError(
                "Please specify how you deal with the duplicates, min, max, mean, False, or None."
            )

        duplicated_indices = self.data.duplicated(self._index_keys, keep=False)
        duplicates = self.data.loc[duplicated_indices, :].sort_values(
            by=self._index_keys, ignore_index=True)

        if duplicate_kept is None or duplicates.empty:
            return 0

        if duplicate_kept in ("min", "max"):
            self.data.sort_values(by=[*self._index_keys, self._value_key],
                                  inplace=True)
            self.data.drop_duplicates(
                self._index_keys,
                keep="first" if duplicate_kept == "min" else "last",
                inplace=True,
                ignore_index=True)
        elif duplicate_kept == "mean":
            df = self.data.drop_duplicates(self._index_keys, keep=False)
            duplicates_means = duplicates.groupby(
                self._index_keys)[self._value_key].mean().reset_index()
            self.data = pd.concat([df, duplicates_means], axis=0)
            self.data.sort_values(by=self._index_keys,
                                  inplace=True,
                                  ignore_index=True)
        else:
            self.data.drop_duplicates(self._index_keys,
                                      keep=False,
                                      inplace=True)

        return duplicates
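A standalone sketch of the "mean" branch on a toy DataFrame (the column names mirror the TLFB layout, but the values are made up):

# Standalone sketch of the "mean" duplicate handling (toy data).
import pandas as pd

data = pd.DataFrame({'id': [1, 1, 1], 'date': [1, 1, 2], 'amount': [10, 20, 5]})
unique_rows = data.drop_duplicates(['id', 'date'], keep=False)
duplicate_means = (data[data.duplicated(['id', 'date'], keep=False)]
                   .groupby(['id', 'date'])['amount'].mean().reset_index())
print(pd.concat([unique_rows, duplicate_means]).sort_values(['id', 'date'], ignore_index=True))
# rows: (1, 1, 15.0) and (1, 2, 5.0)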
Example #4
def mask_dates(tlfb_filepath, bio_filepath, visit_filepath, reference):
    """
    Mask the dates in the visit and TLFB datasets using the anchor visit

    :param tlfb_filepath: the DataFrame object or filepath to the TLFB data (e.g., csv, xls, xlsx)

    :param bio_filepath: the DataFrame object or filepath to the biochemical data (e.g., csv, xls, xlsx)

    :param visit_filepath: the DataFrame object or filepath to the visit data (e.g., csv, xls, xlsx)

    :param reference: the anchor visit (it must be one of the visits in the visit data) or an arbitrary date,
        Union[str "mm/dd/yyyy", datetime.date], which is used to mask the dates in these datasets

    :return: the masked TLFB (and biochemical, if provided) DataFrame(s), followed by the masked visit DataFrame
    """
    visit_df = read_data_from_path(visit_filepath)
    visit_df['date'] = pd.to_datetime(visit_df['date'],
                                      infer_datetime_format=True)
    tlfb_dfs = list()
    for filepath in filter(lambda x: x is not None,
                           (tlfb_filepath, bio_filepath)):
        tlfb_df = read_data_from_path(filepath)
        tlfb_df['date'] = pd.to_datetime(tlfb_df['date'],
                                         infer_datetime_format=True)
        tlfb_dfs.append(tlfb_df)

    if reference in visit_df['visit'].unique():
        anchor_visit = reference
        anchor_dates = visit_df.loc[visit_df['visit'] == anchor_visit, ["id", "date"]]. \
            rename(columns={"date": "anchor_date"})

        tlfb_dfs_anchored = list()
        for tlfb_df in tlfb_dfs:
            tlfb_df_anchored = tlfb_df.merge(anchor_dates, on="id")
            tlfb_df_anchored['date'] = (
                tlfb_df_anchored['date'] - tlfb_df_anchored['anchor_date']
            ).map(lambda x: x.days if pd.notnull(x) else np.nan)
            tlfb_dfs_anchored.append(
                tlfb_df_anchored.drop("anchor_date", axis=1))

        visit_df_anchored = visit_df.merge(anchor_dates, on="id")
        visit_df_anchored['date'] = (
            visit_df_anchored['date'] - visit_df_anchored['anchor_date']
        ).map(lambda x: x.days if pd.notnull(x) else np.nan)
        return tuple(
            (*tlfb_dfs_anchored, visit_df_anchored.drop("anchor_date",
                                                        axis=1)))
    else:
        try:
            reference_date = pd.to_datetime(reference,
                                            infer_datetime_format=True)
        except TypeError:
            raise InputArgumentError(
                "You're expecting to pass a date object or string as a reference."
            )
        else:
            for tlfb_df in tlfb_dfs:
                tlfb_df['date'] = (tlfb_df['date'] -
                                   reference_date).map(lambda x: x.days)
            visit_df['date'] = (visit_df['date'] -
                                reference_date).map(lambda x: x.days)
        return tuple((*tlfb_dfs, visit_df))
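The core of the masking is the subtraction of an anchor date and the conversion to day counts; a minimal standalone sketch of that step (the dates are made up):

# Standalone sketch of the date-to-day-offset conversion used above (made-up dates).
import pandas as pd

dates = pd.to_datetime(pd.Series(["02/01/2021", "02/05/2021"]))
anchor = pd.to_datetime("02/03/2021")
print((dates - anchor).map(lambda x: x.days).tolist())  # [-2, 2]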
Example #5
    def impute_data(self,
                    impute="linear",
                    last_record_action="ffill",
                    maximum_allowed_gap_days=None,
                    biochemical_data=None,
                    overridden_amount="infer"):
        """
        Impute the TLFB data

        :param impute: Union["uniform", "linear", None, int, float], how the missing TLFB data are imputed
            None: no imputation
            "uniform": impute the missing TLFB data using the mean value of the amounts before and after
            the missing interval
            "linear" (the default): impute the missing TLFB data by interpolating a linear trend based on the amounts
            before and after the missing interval
            Numeric value (int or float): impute the missing TLFB data using the specified value

        :param last_record_action: Union[int, float, "ffill", None]
            How to generate one more record after each subject's last record (presumably the day before the last
            visit). This action is useful when you compute abstinence data involving the last visit, which may have
            missing TLFB data.
            "ffill" (the default): generate one more record with the same amount of substance use as the last record
            None: take no action with the last records
            int, float: a numeric value used to fill all subjects' additional last records

        :type maximum_allowed_gap_days: None or int
        :param maximum_allowed_gap_days: When it's None, there is no limit on the length of the missing gap. When it's
            set (e.g., 90 days) and the missing gap exceeds the limit, the calculator will impute the entire window as
            substance use, even if the TLFB records at the start and end of the missing block indicate no use.

        :type biochemical_data: TLFBData
        :param biochemical_data: The biochemical data that is used to impute the self-reported data

        :param overridden_amount: Union[float, int, 'infer']
            The default is 'infer', which means that when a date's biochemical value exceeds the biochemical
            abstinence cutoff while its self-reported use is at or below the self-reported cutoff, the self-reported
            use is overridden as 1 unit above the self-report cutoff. For instance, in most scenarios the
            self-reported cutoff is 0, so the self-reported use will be overridden as 1 when the biochemical value is
            above the cutoff while the subject's self-reported use is 0.
            You can also specify another value to be used for the override when the above condition is met.

        :return: Summary of the imputation
        """
        if impute is None or str(impute).lower() == "none":
            return
        if not (impute in ("uniform", "linear") or str(impute).isnumeric()):
            raise InputArgumentError(
                "The imputation mode can only be None, 'uniform', 'linear', "
                "or a numeric value.")
        if 'imputation_code' in self.data.columns:
            self.data = self.data[self.data['imputation_code'] == 0]

        self.data.sort_values(by=self._index_keys,
                              inplace=True,
                              ignore_index=True)
        self.data['imputation_code'] = DataImputationCode.RAW.value

        to_concat = list()

        if last_record_action is not None:
            last_records = self.data.loc[self.data.groupby('id')
                                         ['date'].idxmax()].copy()
            last_records['date'] = last_records['date'].map(
                lambda x: x + timedelta(days=1))
            if last_record_action != 'ffill':
                last_records['amount'] = float(last_record_action)
            last_records['imputation_code'] = DataImputationCode.IMPUTED.value
            to_concat.append(last_records)

        if biochemical_data is not None:
            if "imputation_code" in biochemical_data.data.columns:
                _biochemical_data = \
                    biochemical_data.data.drop(columns="imputation_code").rename(columns={"amount": "bio_amount"})
            else:
                _biochemical_data = \
                    biochemical_data.data.rename(columns={"amount": "bio_amount"})
            _merged = self.data.merge(_biochemical_data,
                                      how="left",
                                      on=self._index_keys)
            bio_amount = (self.abst_cutoff +
                          1) if overridden_amount == 'infer' else float(
                              overridden_amount)

            def interpolate_tlfb(row):
                amount = row['amount']
                imputation_code = DataImputationCode.RAW.value
                if pd.isnull(row['bio_amount']):
                    return pd.Series([amount, imputation_code])
                if row['amount'] <= self.abst_cutoff and row[
                        'bio_amount'] > biochemical_data.abst_cutoff:
                    amount = bio_amount
                    imputation_code = DataImputationCode.OVERRIDDEN.value
                return pd.Series([amount, imputation_code])

            _merged[['amount',
                     'imputation_code']] = _merged.apply(interpolate_tlfb,
                                                         axis=1)
            self.data = _merged.drop(columns="bio_amount")

        self.data['diff_days'] = self.data.groupby([
            'id'
        ])['date'].diff().map(lambda x: (x.days if self.use_raw_date else x)
                              if pd.notnull(x) else 1)
        missing_data = self.data[self.data['diff_days'] > 1.0]
        imputed_records = []
        for data_row in missing_data.itertuples():
            start_data = self.data.iloc[data_row.Index - 1]
            start_record = TLFBRecord(start_data.id, start_data.date,
                                      start_data.amount,
                                      start_data.imputation_code)
            end_record = TLFBRecord(data_row.id, data_row.date,
                                    data_row.amount,
                                    start_data.imputation_code)
            imputed_records.extend(
                self._impute_missing_block(start_record, end_record, impute,
                                           maximum_allowed_gap_days))
        self.data.drop(['diff_days'], axis=1, inplace=True)
        imputed_tlfb_data = pd.DataFrame(imputed_records)

        to_concat.append(self.data)
        to_concat.append(imputed_tlfb_data)

        self.data = pd.concat(to_concat).sort_values(self._index_keys,
                                                     ignore_index=True)
        impute_summary = self.data.groupby(['imputation_code']).size().reset_index(). \
            rename({0: "record_count"}, axis=1)
        impute_summary['imputation_code'] = impute_summary[
            'imputation_code'].map(lambda x: DataImputationCode(x).name)
        return impute_summary
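The "linear" mode described in the docstring fills a missing gap by interpolating between the records that bound it; the sketch below shows just that interpolation idea with made-up values (it is not the library's _impute_missing_block):

# Standalone sketch of linear gap-filling between two known records (made-up values).
import numpy as np

start_day, start_amount = 10, 8.0   # last observed record before the gap
end_day, end_amount = 14, 0.0       # first observed record after the gap
missing_days = np.arange(start_day + 1, end_day)  # days 11, 12, 13
imputed = np.interp(missing_days, [start_day, end_day], [start_amount, end_amount])
print(dict(zip(missing_days.tolist(), imputed.tolist())))  # {11: 6.0, 12: 4.0, 13: 2.0}
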
    def abstinence_cont(self,
                        start_visit,
                        end_visits,
                        abst_var_names='infer',
                        including_end=False,
                        mode="itt"):
        """
        Calculates the continuous abstinence for the time window.

        :param start_visit: the visit where the time window starts, it should be one of the visits in the visit dataset

        :param end_visits: Union[list, tuple, or Any] the visits for the end(s) of the time window,
            a list of the visits or a single visit, which should all belong to the visits in the visit dataset

        :param abst_var_names: Union[str, list], the name(s) of the abstinence variable(s)
            "infer": the default option, the name(s) will be inferred,
            Note that when specified, the number of abstinence variable names should match the number of end visits

        :param including_end: bool, whether you want to include the end visit or not, default=False

        :param mode: Union["itt", "ro"], how to calculate the abstinence, "itt"=intention-to-treat (the default) or
            "ro"=responders-only

        :return: Pandas DataFrame, subject id and abstinence results
        """
        end_visits = AbstinenceCalculator._listize_args(end_visits)
        abst_names = \
            AbstinenceCalculator._infer_abst_var_names(end_visits, abst_var_names, f'{mode}_cont_v{start_visit}')
        self.visit_data._validate_visits([start_visit, *end_visits])
        if len(end_visits) != len(abst_names):
            raise InputArgumentError(
                "The number of abstinence variable names should match the number of visits."
            )

        results = []
        lapses = []
        for subject_id in self.subject_ids:
            result = [subject_id]

            start_dates = self.visit_data.get_visit_dates(subject_id,
                                                          start_visit,
                                                          mode=mode)
            start_date = start_dates[0] if start_dates else pd.NaT
            end_dates = self.visit_data.get_visit_dates(subject_id,
                                                        end_visits,
                                                        int(including_end),
                                                        mode=mode)

            for end_date_i, end_date in enumerate(end_dates):
                abstinent, lapse = self._score_continuous(
                    subject_id, start_date, end_date, mode)
                result.append(abstinent)
                AbstinenceCalculator._append_lapse_as_needed(
                    lapses, lapse, abst_names[end_date_i])

            results.append(result)

        abstinence_df = pd.DataFrame(results,
                                     columns=['id',
                                              *abst_names]).set_index("id")
        lapses_df = pd.DataFrame(lapses, columns=['id', 'date', 'amount', 'abst_name']).\
            sort_values(by=['abst_name', 'id', 'date'])

        return abstinence_df, lapses_df
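A hedged usage sketch for the method above (the calculator instance and visit numbers are hypothetical):

# Hypothetical usage; `calculator` and the visit numbers are assumptions.
abst_df, lapses_df = calculator.abstinence_cont(start_visit=1, end_visits=[2, 3])
# With mode="itt" (the default), the inferred columns are itt_cont_v1_v2 and itt_cont_v1_v3.
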
    def abstinence_prolonged(self,
                             quit_visit,
                             end_visits,
                             lapse_criterion,
                             grace_days=14,
                             abst_var_names='infer',
                             including_end=False,
                             mode="itt"):
        """
        Calculate the prolonged abstinence using the time window

        :param quit_visit: The visit when the subjects are scheduled to quit smoking

        :param end_visits: The end visits for which the abstinence data are to be calculated

        :param lapse_criterion: Union[False, str, list], the criterion for finding a lapse, which is only examined
            in the time window between the date when the grace period is over and the end date
            False: lapse is not allowed
            Use amounts: it must start with a numeric value, and not end with days, such as "5 cigs", "4 drinks"
            Use days: it must end with days, such as "5 days", "7 days"
            Use amounts over time: such as "5 cigs/7 days", the time intervals are rolling windows
            Use days over time: such as "5 days/7 days", the time intervals are rolling windows
            Use multiple criteria: Combinations of any of these in a tuple or list, such as ("5 cigs", "2 days",
                "2 days/7 days", False)

        :param grace_days: The number of days for the grace period following the quit date

        :param abst_var_names: The name(s) of the abstinence variable(s); by default, the name(s) will be inferred

        :param including_end: Whether you want to include the end visit(s) or not, default=False

        :param mode: How you want to calculate the abstinence, "itt"=intention-to-treat (the default) or
            "ro"=responders-only

        :return: Pandas DataFrame with two columns, subject id and abstinence result
        """
        end_visits = AbstinenceCalculator._listize_args(end_visits)
        self.visit_data._validate_visits([quit_visit, *end_visits])

        all_abst_names = list()
        criteria = AbstinenceCalculator._listize_args(lapse_criterion)
        for criterion in criteria:
            if criterion:
                parsed_criterion_len = len([
                    x for parts in criterion.split("/") for x in parts.split()
                ])
                if parsed_criterion_len not in (2, 4):
                    raise InputArgumentError(
                        "When lapse is allowed, you have to specify the criterion for lapses in "
                        "strings, such as '5 cigs', '5 drinks'. To see the full list of supported"
                        "criteria, please refer to the help menu.")
            else:
                assert criterion in (False, )
            formatted_criterion = criterion.replace(" ", "_") if isinstance(
                criterion, str) else criterion
            abst_names = AbstinenceCalculator.\
                _infer_abst_var_names(end_visits, abst_var_names, f'{mode}_prolonged_{formatted_criterion}')
            all_abst_names.extend(abst_names)

        if len(end_visits) * len(criteria) != len(all_abst_names):
            raise InputArgumentError(
                "The number of abstinence variable names should be equal to the multiplication of"
                "the number of lapse criteria and the number of visits.")

        results = []
        lapses = []

        for subject_id in self.subject_ids:
            result = [subject_id]
            start_dates = self.visit_data.get_visit_dates(subject_id,
                                                          quit_visit,
                                                          grace_days,
                                                          mode=mode)
            start_date = start_dates[0] if start_dates else pd.NaT
            end_dates = self.visit_data.get_visit_dates(subject_id,
                                                        end_visits,
                                                        int(including_end),
                                                        mode=mode)

            for criterion_i, criterion in enumerate(criteria):
                for end_date_i, end_date in enumerate(end_dates):
                    abstinent = 0 if mode == 'itt' else np.nan
                    lapse = None

                    if AbstinenceCalculator._validate_dates(
                            subject_id, start_date, end_date):
                        subject_data = self.tlfb_data.get_subject_data(
                            subject_id, start_date, end_date, mode)
                        if not criterion:
                            abstinent, lapse = self._continuous_abst(
                                subject_id, start_date, end_date, mode)
                        else:
                            parsed_criterion = [
                                x for parts in criterion.split("/")
                                for x in parts.split()
                            ]
                            if len(parsed_criterion) == 2:
                                lapse_threshold = float(parsed_criterion[0])
                                lapse_tracking = 0
                                for record in subject_data.itertuples():
                                    if parsed_criterion[-1] != "days":
                                        lapse_tracking += record.amount
                                    else:
                                        lapse_tracking += (
                                            record.amount >
                                            self.tlfb_data.abst_cutoff)
                                    if lapse_tracking >= lapse_threshold:
                                        lapse = record
                                        break
                            else:
                                assert len(parsed_criterion) == 4
                                cutoff_amount, cutoff_unit, window_amount, _ = parsed_criterion
                                cutoff_amount = float(cutoff_amount)
                                window_amount = int(float(window_amount))
                                index_list = \
                                    subject_data.index[subject_data['amount'] > self.tlfb_data.abst_cutoff].tolist()
                                for j in index_list:
                                    one_window = range(j, j + window_amount)
                                    lapse_tracking = 0
                                    for elem_i, elem in enumerate(
                                            one_window, j):
                                        if cutoff_unit == "days":
                                            lapse_tracking += elem in index_list
                                        else:
                                            if elem_i in subject_data.index:
                                                lapse_tracking += subject_data.loc[
                                                    elem_i, "amount"]
                                        if lapse_tracking > cutoff_amount:
                                            if elem_i in subject_data.index:
                                                lapse = subject_data.loc[
                                                    elem_i, :]
                                                break
                                    if lapse is not None:
                                        break

                            days = int((end_date -
                                        start_date).days if self.tlfb_data.
                                       use_raw_date else (end_date -
                                                          start_date))
                            if subject_data['amount'].count() == days:
                                abstinent = int(lapse is None)

                    result.append(abstinent)
                    abst_name = all_abst_names[len(end_dates) * criterion_i +
                                               end_date_i]
                    AbstinenceCalculator._append_lapse_as_needed(
                        lapses, lapse, abst_name)

            results.append(result)

        abstinence_df = pd.DataFrame(results,
                                     columns=['id',
                                              *all_abst_names]).set_index("id")
        lapses_df = pd.DataFrame(lapses, columns=['id', 'date', 'amount', 'abst_name']).\
            sort_values(by=['abst_name', 'id', 'date'])

        return abstinence_df, lapses_df
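The lapse criterion strings above are parsed by splitting on "/" and whitespace, and the resulting part count (2 or 4) decides which branch runs; a standalone sketch of that parsing using criteria from the docstring:

# Standalone sketch of how a lapse criterion string breaks into parts.
for criterion in ("5 cigs", "5 days/7 days"):
    parsed = [x for parts in criterion.split("/") for x in parts.split()]
    print(criterion, parsed, len(parsed))
# 5 cigs ['5', 'cigs'] 2
# 5 days/7 days ['5', 'days', '7', 'days'] 4
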
    def abstinence_pp(self,
                      end_visits,
                      days,
                      abst_var_names='infer',
                      including_end=False,
                      mode="itt"):
        """
        Calculate the point-prevalence abstinence using the end visit's date.

        :param end_visits: The reference visit(s) on which the abstinence is to be calculated

        :param days: The number of days preceding the end visit(s), it can be a single integer, or a list/tuple of days

        :param abst_var_names: The name(s) of the abstinence variable(s), by default, the name(s) will be inferred,
            if not inferred, the number of abstinence variable names should match the number of end visits

        :param including_end: Whether you want to include the end visit(s) or not, default=False

        :param mode: How you want to calculate the abstinence, "itt"=intention-to-treat (the default) or
            "ro"=responders-only

        :return: Pandas DataFrame, subject id and abstinence results
        """
        days = AbstinenceCalculator._listize_args(days)
        if any(day < 1 for day in days):
            raise InputArgumentError(
                "The number of days has to be a positive integer.")
        end_visits = AbstinenceCalculator._listize_args(end_visits)
        self.visit_data._validate_visits(end_visits)

        all_abst_names = list()
        for day in days:
            abst_names = AbstinenceCalculator._infer_abst_var_names(
                end_visits, abst_var_names, f'{mode}_pp{day}')
            all_abst_names.extend(abst_names)

        if len(end_visits) * len(days) != len(all_abst_names):
            raise InputArgumentError(
                "The number of abstinence variable names should be equal to the multiplication of"
                "the number of day conditions and the number of visits.")

        results = []
        lapses = []

        for subject_id in self.subject_ids:
            result = [subject_id]
            end_dates = self.visit_data.get_visit_dates(subject_id,
                                                        end_visits,
                                                        int(including_end),
                                                        mode=mode)

            for day_i, day in enumerate(days):
                for end_date_i, end_date in enumerate(end_dates):
                    start_date = end_date + (timedelta(
                        days=-day) if self.tlfb_data.use_raw_date else -day)
                    abstinent, lapse = self._score_continuous(
                        subject_id, start_date, end_date, mode)
                    result.append(abstinent)
                    abst_name = all_abst_names[len(end_dates) * day_i +
                                               end_date_i]
                    AbstinenceCalculator._append_lapse_as_needed(
                        lapses, lapse, abst_name)

            results.append(result)

        abstinence_df = pd.DataFrame(results,
                                     columns=['id',
                                              *all_abst_names]).set_index("id")
        lapses_df = pd.DataFrame(lapses, columns=['id', 'date', 'amount', 'abst_name']).\
            sort_values(by=['abst_name', 'id', 'date'])

        return abstinence_df, lapses_df
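A hedged usage sketch for the method above (the calculator instance, visit numbers, and day window are hypothetical):

# Hypothetical usage; `calculator`, the visits, and the 7-day window are assumptions.
abst_df, lapses_df = calculator.abstinence_pp(end_visits=[3, 4], days=7)
# With mode="itt" (the default), the inferred columns are itt_pp7_v3 and itt_pp7_v4.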
Example #9
def _validate_visits(self, visits_to_check):
    if not set(visits_to_check).issubset(self.visits):
        raise InputArgumentError(
            f"Some visits are not in the visit list {self.visits}. Please check your arguments."
        )
Example #10
    def impute_data(self, impute='freq', anchor_visit='infer'):
        """
        Impute any missing visit data.

        :type anchor_visit: object
        :param anchor_visit: The anchor visit, which needs to exist for all subjects and the date of which will be
            used to impute the missing visit dates

        :param impute: Union["freq", "mean", None, dict], how the missing visit data are imputed
        None: no imputation
        "freq": use the most frequent, see below for further clarification
        "mean": use the average, see below for further clarification
            Both imputation methods assume that the visit structure is the same among all subjects. It will first find
            the earliest visit as the anchor date, impute any missing visit dates either using the average or the most
            frequent interval. Please note the anchor dates can't be missing.
        dict: A dictionary object mapping the number of days since the anchor visit

        :return DataFrame
        """

        if impute is None or str(impute).lower() == "none":
            return
        if impute not in ('freq', 'mean') and not isinstance(impute, dict):
            raise InputArgumentError(
                'You can only specify the imputation method to be "freq" or "mean". Alternatively, '
                'you can specify a dictionary object mapping the number of days since the anchor '
                'visit.')

        if 'imputation_code' in self.data.columns:
            self.data = self.data[self.data['imputation_code'] == 0]

        self.data = self.data.sort_values(by=self._index_keys,
                                          ignore_index=True)
        if anchor_visit == 'infer' or anchor_visit is None:
            min_date_indices = self.data.groupby(['id'])['date'].idxmin()
            anchor_visit = self.data.loc[min_date_indices,
                                         'visit'].value_counts().idxmax()

        visit_ids = sorted(self.subject_ids)
        anchor_ids = set(self.data.loc[self.data['visit'] == anchor_visit,
                                       'id'].unique())
        missing_anchor_ids = set(visit_ids) - anchor_ids
        if missing_anchor_ids:
            message = f"Subjects {missing_anchor_ids} are missing anchor visit {anchor_visit}. " \
                      f"There might be problems calculating abstinence data for these subjects."
            _show_warning(message)
        ids_s = pd.Series(visit_ids, name='id')
        anchor_dates = self.data.loc[self.data['visit'] == anchor_visit, ['id', 'date']].\
            rename({'date': 'anchor_date'}, axis=1)
        anchor_df = pd.merge(ids_s, anchor_dates, how='outer', on='id')

        df_anchor = anchor_df.copy()
        df_anchor['visit'] = anchor_visit
        df_anchor['imputed_date'] = df_anchor['date'] = df_anchor[
            'anchor_date']
        imputed_visit_dfs = [df_anchor]

        for visit in self.visits - {anchor_visit}:
            visit_dates = self.data.loc[self.data['visit'] == visit,
                                        ['id', 'date']]
            df_visit = pd.merge(anchor_df, visit_dates, how='outer', on='id')
            days_diff = (df_visit['date'] -
                         df_visit['anchor_date']).map(lambda day_diff: (
                             day_diff.days if self.use_raw_date else day_diff
                         ) if pd.notnull(day_diff) else np.nan)
            if impute == 'freq':
                used_days_diff = days_diff.value_counts().idxmax()
            elif impute == 'mean':
                used_days_diff = int(days_diff.mean())
            else:
                used_days_diff = impute[visit]

            def impute_date(x):
                if pd.notnull(x['date']):
                    return x['date']
                imputed_date = x['anchor_date'] + \
                               (timedelta(days=used_days_diff) if self.use_raw_date else used_days_diff)
                return imputed_date

            df_visit['imputed_date'] = df_visit.apply(impute_date, axis=1)
            df_visit['visit'] = visit
            imputed_visit_dfs.append(df_visit)

        visit_data_imputed = pd.concat(imputed_visit_dfs)
        visit_data_imputed['imputation_code'] = \
            visit_data_imputed['date'].isnull().map(int)
        self.data = visit_data_imputed.drop(['date', 'anchor_date'], axis=1). \
            rename({'imputed_date': 'date'}, axis=1).sort_values(by=self._index_keys, ignore_index=True)
        impute_summary = self.data.groupby(['imputation_code']).size().reset_index(). \
            rename({0: "record_count"}, axis=1)
        impute_summary['imputation_code'] = impute_summary[
            'imputation_code'].map(lambda x: DataImputationCode(x).name)
        return impute_summary
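A hedged usage sketch for the method above (the visit-data instance, visit labels, and day offsets are hypothetical):

# Hypothetical usage; `visit_data`, the visit labels, and the offsets are assumptions.
visit_data.impute_data(impute='freq', anchor_visit=0)
# Or supply the expected days-since-anchor for each visit directly:
visit_data.impute_data(impute={1: 7, 2: 14, 3: 28}, anchor_visit=0)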