def test_nnd_hotdeck_using_rpy2():
    """Smoke-test ``nnd_hotdeck_using_rpy2`` on R's ``iris`` data set.

    ``iris`` is split into a recipient frame (without "Petal.Width") and a
    donor frame (without "Petal.Length"). "Petal.Width" is then imputed on
    the recipients, using "Species" as donation class and the two sepal
    measurements as matching variables.

    The test returns early, after printing a notice, when the module-level
    ``rpy2`` name is ``None``.
    """
    if rpy2 is None:
        print('rpy2 is absent: skipping test')
        return

    r.data('iris')

    pandas2ri.activate()
    # or explicitly do:
    # iris = pandas2ri.ri2py(r['iris'])

    iris = r['iris']

    # recipient data.frame: rows labelled 1-15, 51-65 and 101-115
    # (.loc slices are label-based and include both end points)
    # NOTE(review): assumes the converted frame keeps R's 1-based integer
    # row labels -- confirm for the rpy2 version in use.
    iris_rec = pd.concat([
        iris.loc[1:15],
        iris.loc[51:65],
        iris.loc[101:115],
        ])
    del iris_rec["Petal.Width"]

    # donor data.frame: the remaining rows
    iris_don = pd.concat([
        iris.loc[16:50],
        iris.loc[66:100],
        iris.loc[116:150],
        ])
    # "Petal.Length" must be removed from the donor (it used to be removed
    # from the recipient, which left the donor with both petal variables).
    del iris_don["Petal.Length"]

    # Now iris_rec and iris_don have the variables
    # "Sepal.Length", "Sepal.Width" and "Species"
    # in common.
    # "Petal.Length" is available only in iris_rec
    # "Petal.Width" is available only in iris_don

    # find the closest donors using NND hot deck;
    # distances are computed on "Sepal.Length" and "Sepal.Width"

    # Unpacking into two names also checks that a pair is returned.
    x, y = nnd_hotdeck_using_rpy2(
        receiver = iris_rec,
        donor = iris_don,
        donor_classes = 'Species',
        z_variables = "Petal.Width",
        matching_variables = ["Sepal.Length", "Sepal.Width"]
        )
def test_nnd_hotdeck_using_rpy2():
    """Run ``nnd_hotdeck_using_rpy2`` end to end on R's ``iris`` data set.

    The recipient frame keeps "Petal.Length" but not "Petal.Width"; the donor
    frame keeps "Petal.Width" but not "Petal.Length". "Petal.Width" is the
    variable to impute, "Species" the donation class, and the sepal
    measurements are the matching variables.

    When the module-level ``rpy2`` name is ``None`` a notice is printed and
    the test returns without doing anything.
    """
    if rpy2 is None:
        print('rpy2 is absent: skipping test')
        return

    r.data('iris')

    pandas2ri.activate()
    # or explicitly do:
    # iris = pandas2ri.ri2py(r['iris'])

    iris = r['iris']

    # recipient data.frame: 15 rows out of each block of 50
    # (label-based .loc slices, both bounds included)
    # NOTE(review): relies on integer row labels 1..150 -- confirm against
    # the pandas conversion performed by rpy2.
    iris_rec = pd.concat([
        iris.loc[1:15],
        iris.loc[51:65],
        iris.loc[101:115],
        ])
    del iris_rec["Petal.Width"]

    # donor data.frame: every row not used as a recipient
    iris_don = pd.concat([
        iris.loc[16:50],
        iris.loc[66:100],
        iris.loc[116:150],
        ])
    # Fix: drop "Petal.Length" from the donor frame. The previous code
    # dropped it from the recipient frame, contradicting the layout
    # described just below.
    del iris_don["Petal.Length"]

    # Now iris_rec and iris_don have the variables
    # "Sepal.Length", "Sepal.Width" and "Species"
    # in common.
    # "Petal.Length" is available only in iris_rec
    # "Petal.Width" is available only in iris_don

    # find the closest donors using NND hot deck;
    # distances are computed on "Sepal.Length" and "Sepal.Width"

    # The two-name unpacking doubles as a check on the return arity.
    x, y = nnd_hotdeck_using_rpy2(
        receiver = iris_rec,
        donor = iris_don,
        donor_classes = 'Species',
        z_variables = "Petal.Width",
        matching_variables = ["Sepal.Length", "Sepal.Width"]
        )
def imputation_loyer(temporary_store=None, year=None):
    """Impute a rent (``loyer``) on the ``menages_<year>`` table.

    Steps, as implemented below:

    1. Build the comparable ERF (receiver) and logement (donor) frames.
    2. Drop the donors whose ``lmlm`` is missing.
    3. Report, then enforce, that every value taken by a matching variable
       in the receiver also exists in the donor.
    4. Run ``matching.nnd_hotdeck_using_rpy2`` with ``lmlm`` as the variable
       to impute and ``deci`` as donation class.
    5. Rename ``lmlm`` to ``loyer``, left-merge it on ``ident`` into
       ``temporary_store['menages_<year>']`` and store the result back.

    Args:
        temporary_store: mapping-like store; read and written under the key
            ``'menages_{year}'`` and forwarded to the frame builders.
        year: survey year, used to build the store key.

    Returns:
        None. The result is written to ``temporary_store``.

    Raises:
        AssertionError: if an argument is ``None``, if a receiver value is
            absent from the donor after cleaning, or if ``loyer`` is missing
            from the merged table.
    """
    assert temporary_store is not None
    assert year is not None

    # Only this flavour is wired in; the ``else`` branches below describe the
    # alternative variable set and are currently unreachable.
    kind = 'erfs_fpr'
    erf = create_comparable_erf_data_frame(temporary_store=temporary_store,
                                           year=year)
    logement = create_comparable_logement_data_frame(
        temporary_store=temporary_store, year=year)

    # Count the donors without lmlm *before* removing them: counting after
    # the filter always reported 0.
    missing_lmlm = logement.lmlm.isnull()
    log.info("Dropping {} observations form logement".format(
        missing_lmlm.sum()))
    logement = logement.loc[~missing_lmlm].copy()

    if kind == 'erfs_fpr':
        allvars = [
            'deci',
            'hnph2',
            'magtr',
            'mcs8',
            'mdiplo',
            'mtybd',
            'statut_occupation',
        ]
    else:
        allvars = [
            'deci',
            'hnph2',
            'iaat_bis',
            'magtr',
            'mcs8',
            'mdiplo',
            'mtybd',
            'statut_occupation',
            'tu99_recoded',
        ]

    # TODO keep the variable indices

    erf = erf[allvars + ['ident', 'wprm']].copy()

    # Diagnostic pass: print every variable whose receiver values are not all
    # present on the donor side, before the zero-coded rows are removed.
    for variable in allvars:
        erf_unique_values = set(erf[variable].unique())
        logement_unique_values = set(logement[variable].unique())
        if not erf_unique_values <= logement_unique_values:
            print('''
                {} span wrong
                erf: {},
                logement: {}
                concerns {} observations
                '''.format(
                variable, erf_unique_values, logement_unique_values,
                erf[variable].isin(erf_unique_values -
                                   logement_unique_values).sum()))

    # Remove the receiver rows carrying a 0 in the listed variables.
    if kind == 'erfs_fpr':
        log.info("dropping {} erf observations".format(
            len(erf.query('mtybd == 0 | mcs8 == 0'))))
        erf = erf.query('mtybd != 0 & mcs8 != 0').copy()

    else:
        log.info("dropping {} erf observations".format(
            len(erf.query('iaat_bis == 0 | mtybd == 0 | mcs8 == 0'))))
        erf = erf.query('iaat_bis != 0 & mtybd != 0 & mcs8 != 0').copy()

    # After cleaning, the inclusion must hold for every variable.
    for variable in allvars:
        erf_unique_values = set(erf[variable].unique())
        logement_unique_values = set(logement[variable].unique())
        assert erf_unique_values <= logement_unique_values

    if kind == 'erfs_fpr':
        class_variables = ['deci']
    else:
        class_variables = ['deci', 'tu99_recoded']

    # A single class is still handed over as a bare string, as before.
    if len(class_variables) == 1:
        classes = class_variables[0]
    else:
        classes = class_variables

    # Matching variables = all variables except the donation classes.
    # The former ``set(allvars) - set("deci")`` subtracted the *characters*
    # of the string, so 'deci' was never removed; going through a set also
    # made the order of the variables arbitrary.
    matchvars = [
        variable for variable in allvars if variable not in class_variables
    ]

    # The second returned object is not used here.
    fill_erf_nnd, fill_erf_nnd_1 = matching.nnd_hotdeck_using_rpy2(
        receiver=erf,
        donor=logement,
        matching_variables=matchvars,
        z_variables="lmlm",
        donor_classes=classes,
    )

    fill_erf_nnd.rename(columns={'lmlm': 'loyer'}, inplace=True)

    loyers_imputes = fill_erf_nnd[['ident', 'loyer']].copy()
    menages = temporary_store['menages_{}'.format(year)]
    # Clear any rent column left by a previous run so that the merge yields
    # a single, unsuffixed 'loyer' column.
    for loyer_var in ['loyer_x', 'loyer_y', 'loyer']:
        if loyer_var in menages.columns:
            del menages[loyer_var]

    menages = menages.merge(loyers_imputes, on='ident', how='left')
    assert 'loyer' in menages.columns, u"La variable loyer n'est pas présente dans menages"

    temporary_store['menages_{}'.format(year)] = menages
    return
def imputation_loyer(temporary_store=None, year=None):
    """Impute a rent (``loyer``) on the ``menages_<year>`` table.

    The comparable ERF frame is the receiver and the comparable logement
    frame is the donor. Donors without ``lmlm`` are discarded, receiver rows
    with a 0 in ``mtybd`` or ``mcs8`` are discarded, and ``lmlm`` is imputed
    through ``matching.nnd_hotdeck_using_rpy2`` with ``deci`` as donation
    class. The imputed value is renamed ``loyer``, left-merged on ``ident``
    into ``temporary_store['menages_<year>']`` and stored back.

    Args:
        temporary_store: mapping-like store; read and written under the key
            ``'menages_{year}'`` and forwarded to the frame builders.
        year: survey year, used to build the store key.

    Returns:
        None. The result is written to ``temporary_store``.

    Raises:
        AssertionError: if an argument is ``None``, if a receiver value is
            absent from the donor after cleaning, or if ``loyer`` is missing
            from the merged table.
    """
    assert temporary_store is not None
    assert year is not None

    # Hard-coded flavour: the ``else`` branches below are currently
    # unreachable and only document the alternative variable set.
    kind = 'erfs_fpr'
    erf = create_comparable_erf_data_frame(temporary_store=temporary_store, year=year)
    logement = create_comparable_logement_data_frame(temporary_store=temporary_store, year=year)

    # The number of dropped donors has to be computed before the filter is
    # applied; computed afterwards it is always 0.
    lmlm_is_missing = logement.lmlm.isnull()
    log.info("Dropping {} observations form logement".format(lmlm_is_missing.sum()))
    logement = logement.loc[~lmlm_is_missing].copy()

    if kind == 'erfs_fpr':
        allvars = [
            'deci',
            'hnph2',
            'magtr',
            'mcs8',
            'mdiplo',
            'mtybd',
            'statut_occupation',
            ]
    else:
        allvars = [
            'deci',
            'hnph2',
            'iaat_bis',
            'magtr',
            'mcs8',
            'mdiplo',
            'mtybd',
            'statut_occupation',
            'tu99_recoded',
            ]

    # TODO keep the variable indices

    erf = erf[allvars + ['ident', 'wprm']].copy()

    # Diagnostic pass: report the variables whose receiver values are not
    # all found among the donor values.
    for variable in allvars:
        erf_unique_values = set(erf[variable].unique())
        logement_unique_values = set(logement[variable].unique())
        if not erf_unique_values <= logement_unique_values:
            # print() with one parenthesised argument is valid on both
            # Python 2 and Python 3; the bare print statement was a
            # SyntaxError on Python 3.
            print('''
{} span wrong
erf: {},
logement: {}
concerns {} observations
'''.format(
                variable,
                erf_unique_values,
                logement_unique_values,
                erf[variable].isin(erf_unique_values - logement_unique_values).sum()
                ))

    # Remove the receiver rows carrying a 0 in the listed variables.
    if kind == 'erfs_fpr':
        log.info("dropping {} erf observations".format(len(erf.query('mtybd == 0 | mcs8 == 0'))))
        erf = erf.query('mtybd != 0 & mcs8 != 0').copy()

    else:
        log.info("dropping {} erf observations".format(len(erf.query('iaat_bis == 0 | mtybd == 0 | mcs8 == 0'))))
        erf = erf.query('iaat_bis != 0 & mtybd != 0 & mcs8 != 0').copy()

    # Once cleaned, every receiver value must exist on the donor side.
    for variable in allvars:
        erf_unique_values = set(erf[variable].unique())
        logement_unique_values = set(logement[variable].unique())
        assert erf_unique_values <= logement_unique_values

    if kind == 'erfs_fpr':
        class_variables = ['deci']
    else:
        class_variables = ['deci', 'tu99_recoded']

    # Keep passing a bare string when there is a single donation class.
    classes = class_variables[0] if len(class_variables) == 1 else class_variables

    # ``set("deci")`` is the set of the string's characters, so the former
    # set difference never removed 'deci' from the matching variables (and
    # produced them in arbitrary order). Filter the ordered list instead.
    matchvars = [variable for variable in allvars if variable not in class_variables]

    # Only the first returned object is used below.
    fill_erf_nnd, fill_erf_nnd_1 = matching.nnd_hotdeck_using_rpy2(
        receiver=erf,
        donor=logement,
        matching_variables=matchvars,
        z_variables=u"lmlm",
        donor_classes=classes,
        )

    fill_erf_nnd.rename(columns={'lmlm': 'loyer'}, inplace=True)

    loyers_imputes = fill_erf_nnd[['ident', 'loyer']].copy()
    menages = temporary_store['menages_{}'.format(year)]
    # Remove rent columns left over from an earlier run so the merge below
    # produces one unsuffixed 'loyer' column.
    for loyer_var in ['loyer_x', 'loyer_y', 'loyer']:
        if loyer_var in menages.columns:
            del menages[loyer_var]

    menages = menages.merge(loyers_imputes, on='ident', how='left')
    assert 'loyer' in menages.columns, u"La variable loyer n'est pas présente dans menages"

    temporary_store['menages_{}'.format(year)] = menages
    return