def test_outlier_test():
    """Compare Bonferroni outlier-test results with stored reference values.

    The same reference table is checked three ways: ``oi.outlier_test`` on a
    fit built directly from ``endog``/``exog``, ``oi.outlier_test`` on a
    formula fit whose data frame is indexed by the labels, and the
    ``outlier_test`` method of that formula fit (also exercising the
    ``order`` and ``cutoff`` keywords).
    """
    endog, exog, labels = get_duncan_data()
    array_fit = OLS(endog, exog).fit()

    # Reference columns; entries are in ascending order of unadjusted p-value.
    ref_rstudent = [
        3.1345185839, -2.3970223990, 2.0438046359, -1.9309187757,
        1.8870465798, -1.7604905300, -1.7040324156, 1.6024285876,
        -1.4332485037, -1.1044851583, 1.0688582315, 1.0185271840,
        -0.9024219332, -0.9023876471, -0.8830953936, 0.8265782334,
        0.8089220547, 0.7682770197, 0.7319491074, -0.6665962829,
        0.5227352794, -0.5135016547, 0.5083881518, 0.4999224372,
        -0.4980818221, -0.4759717075, -0.4293565820, -0.4114056499,
        -0.3779540862, 0.3556874030, 0.3409200462, 0.3062248646,
        0.3038999429, -0.3030815773, -0.1873387893, 0.1738050251,
        0.1424246593, -0.1292266025, 0.1272066463, -0.0798902878,
        0.0788467222, 0.0722556991, 0.0505098280, 0.0233215136,
        0.0007112055,
    ]
    ref_unadj_p = [
        0.003177202, 0.021170298, 0.047432955,
        0.060427645, 0.066248120, 0.085783008,
        0.095943909, 0.116738318, 0.159368890,
        0.275822623, 0.291386358, 0.314400295,
        0.372104049, 0.372122040, 0.382333561,
        0.413260793, 0.423229432, 0.446725370,
        0.468363101, 0.508764039, 0.603971990,
        0.610356737, 0.613905871, 0.619802317,
        0.621087703, 0.636621083, 0.669911674,
        0.682917818, 0.707414459, 0.723898263,
        0.734904667, 0.760983108, 0.762741124,
        0.763360242, 0.852319039, 0.862874018,
        0.887442197, 0.897810225, 0.899398691,
        0.936713197, 0.937538115, 0.942749758,
        0.959961394, 0.981506948, 0.999435989,
    ]
    # Stored Bonferroni values exceed 1 for most rows; they are capped at 1
    # before being compared with the computed p-values.
    ref_bonf_p = np.minimum(
        np.array([
            0.1429741, 0.9526634, 2.1344830, 2.7192440, 2.9811654,
            3.8602354, 4.3174759, 5.2532243, 7.1716001, 12.4120180,
            13.1123861, 14.1480133, 16.7446822, 16.7454918, 17.2050103,
            18.5967357, 19.0453245, 20.1026416, 21.0763395, 22.8943818,
            27.1787396, 27.4660532, 27.6257642, 27.8911043, 27.9489466,
            28.6479487, 30.1460253, 30.7313018, 31.8336506, 32.5754218,
            33.0707100, 34.2442399, 34.3233506, 34.3512109, 38.3543568,
            38.8293308, 39.9348989, 40.4014601, 40.4729411, 42.1520939,
            42.1892152, 42.4237391, 43.1982627, 44.1678127, 44.9746195,
        ]),
        1,
    )
    expected_order = [
        "minister", "reporter", "contractor", "insurance.agent",
        "machinist", "store.clerk", "conductor", "factory.owner",
        "mail.carrier", "streetcar.motorman", "carpenter", "coal.miner",
        "bartender", "bookkeeper", "soda.clerk", "chemist",
        "RR.engineer", "professor", "electrician", "gas.stn.attendant",
        "auto.repairman", "watchman", "banker", "machine.operator",
        "dentist", "waiter", "shoe.shiner", "welfare.worker",
        "plumber", "physician", "pilot", "engineer",
        "accountant", "lawyer", "undertaker", "barber",
        "store.manager", "truck.driver", "cook", "janitor",
        "policeman", "architect", "teacher", "taxi.driver",
        "author",
    ]
    expected = np.column_stack((ref_rstudent, ref_unadj_p, ref_bonf_p))

    # Function interface on the endog/exog fit, with labels passed explicitly.
    from_arrays = oi.outlier_test(
        array_fit, method='b', labels=labels, order=True
    )
    np.testing.assert_almost_equal(from_arrays.values, expected, 7)
    np.testing.assert_equal(
        from_arrays.index.tolist(),  # pylint: disable-msg=E1103
        expected_order,
    )

    # check `order` with pandas bug in #3971
    frame = pd.DataFrame(
        np.column_stack((endog, exog)),
        columns='y const var1 var2'.split(),
        index=labels,
    )
    formula_fit = OLS.from_formula('y ~ const + var1 + var2 - 0', frame).fit()

    ordered_by_function = oi.outlier_test(formula_fit, method='b', order=True)
    assert_almost_equal(ordered_by_function.values, expected, 7)
    assert_equal(ordered_by_function.index.tolist(), expected_order)

    # The unordered method result, sorted by hand, must agree with the above.
    sorted_by_hand = formula_fit.outlier_test(method='b').sort_values(
        ['unadj_p'], ascending=True
    )
    assert_almost_equal(sorted_by_hand.values, expected, 7)
    assert_equal(sorted_by_hand.index.tolist(), expected_order)
    assert_array_equal(ordered_by_function.index, sorted_by_hand.index)

    # additional keywords in method
    ordered_by_method = formula_fit.outlier_test(method='b', order=True)
    assert_equal(ordered_by_method.index.tolist(), expected_order)
    below_cutoff = formula_fit.outlier_test(
        method='b', order=True, cutoff=0.15
    )
    assert_equal(below_cutoff.index.tolist(), expected_order[:1])
# Example #2
# 0
def test_outlier_test():
    """Check ``oi.outlier_test`` on inline data against stored results.

    Expected results are from R with NA -> 1. Only the interface is tested
    here because outlier_test is just a wrapper.
    """
    occupations = [
        "accountant", "pilot", "architect", "author",
        "chemist", "minister", "professor", "dentist",
        "reporter", "engineer", "undertaker", "lawyer",
        "physician", "welfare.worker", "teacher", "conductor",
        "contractor", "factory.owner", "store.manager", "banker",
        "bookkeeper", "mail.carrier", "insurance.agent", "store.clerk",
        "carpenter", "electrician", "RR.engineer", "machinist",
        "auto.repairman", "plumber", "gas.stn.attendant", "coal.miner",
        "streetcar.motorman", "taxi.driver", "truck.driver", "machine.operator",
        "barber", "bartender", "shoe.shiner", "cook",
        "soda.clerk", "watchman", "janitor", "policeman",
        "waiter",
    ]

    # Duncan's prestige data from car.  Each pair holds the two non-constant
    # regressors of one observation; the constant 1.0 is prepended below.
    regressor_pairs = [
        (62.0, 86.0), (72.0, 76.0), (75.0, 92.0), (55.0, 90.0), (64.0, 86.0),
        (21.0, 84.0), (64.0, 93.0), (80.0, 100.0), (67.0, 87.0), (72.0, 86.0),
        (42.0, 74.0), (76.0, 98.0), (76.0, 97.0), (41.0, 84.0), (48.0, 91.0),
        (76.0, 34.0), (53.0, 45.0), (60.0, 56.0), (42.0, 44.0), (78.0, 82.0),
        (29.0, 72.0), (48.0, 55.0), (55.0, 71.0), (29.0, 50.0), (21.0, 23.0),
        (47.0, 39.0), (81.0, 28.0), (36.0, 32.0), (22.0, 22.0), (44.0, 25.0),
        (15.0, 29.0), (7.0, 7.0), (42.0, 26.0), (9.0, 19.0), (21.0, 15.0),
        (21.0, 20.0), (16.0, 26.0), (16.0, 28.0), (9.0, 17.0), (14.0, 22.0),
        (12.0, 30.0), (17.0, 25.0), (7.0, 20.0), (34.0, 47.0), (8.0, 32.0),
    ]
    design = [[1.0, first, second] for first, second in regressor_pairs]
    response = [
        82.0, 83.0, 90.0, 76.0, 90.0, 87.0, 93.0, 90.0, 52.0,
        88.0, 57.0, 89.0, 97.0, 59.0, 73.0, 38.0, 76.0, 81.0,
        45.0, 92.0, 39.0, 34.0, 41.0, 16.0, 33.0, 53.0, 67.0,
        57.0, 26.0, 29.0, 10.0, 15.0, 19.0, 10.0, 13.0, 24.0,
        20.0, 7.0, 3.0, 16.0, 6.0, 11.0, 8.0, 41.0, 10.0,
    ]
    fitted = OLS(response, design).fit()

    # Reference columns; entries are in ascending order of unadjusted p-value.
    want_rstudent = [
        3.1345185839, -2.3970223990, 2.0438046359, -1.9309187757, 1.8870465798,
        -1.7604905300, -1.7040324156, 1.6024285876, -1.4332485037, -1.1044851583,
        1.0688582315, 1.0185271840, -0.9024219332, -0.9023876471, -0.8830953936,
        0.8265782334, 0.8089220547, 0.7682770197, 0.7319491074, -0.6665962829,
        0.5227352794, -0.5135016547, 0.5083881518, 0.4999224372, -0.4980818221,
        -0.4759717075, -0.4293565820, -0.4114056499, -0.3779540862, 0.3556874030,
        0.3409200462, 0.3062248646, 0.3038999429, -0.3030815773, -0.1873387893,
        0.1738050251, 0.1424246593, -0.1292266025, 0.1272066463, -0.0798902878,
        0.0788467222, 0.0722556991, 0.0505098280, 0.0233215136, 0.0007112055,
    ]
    want_unadj_p = [
        0.003177202, 0.021170298, 0.047432955, 0.060427645, 0.066248120,
        0.085783008, 0.095943909, 0.116738318, 0.159368890, 0.275822623,
        0.291386358, 0.314400295, 0.372104049, 0.372122040, 0.382333561,
        0.413260793, 0.423229432, 0.446725370, 0.468363101, 0.508764039,
        0.603971990, 0.610356737, 0.613905871, 0.619802317, 0.621087703,
        0.636621083, 0.669911674, 0.682917818, 0.707414459, 0.723898263,
        0.734904667, 0.760983108, 0.762741124, 0.763360242, 0.852319039,
        0.862874018, 0.887442197, 0.897810225, 0.899398691, 0.936713197,
        0.937538115, 0.942749758, 0.959961394, 0.981506948, 0.999435989,
    ]
    raw_bonf_p = np.array([
        0.1429741, 0.9526634, 2.1344830, 2.7192440, 2.9811654,
        3.8602354, 4.3174759, 5.2532243, 7.1716001, 12.4120180,
        13.1123861, 14.1480133, 16.7446822, 16.7454918, 17.2050103,
        18.5967357, 19.0453245, 20.1026416, 21.0763395, 22.8943818,
        27.1787396, 27.4660532, 27.6257642, 27.8911043, 27.9489466,
        28.6479487, 30.1460253, 30.7313018, 31.8336506, 32.5754218,
        33.0707100, 34.2442399, 34.3233506, 34.3512109, 38.3543568,
        38.8293308, 39.9348989, 40.4014601, 40.4729411, 42.1520939,
        42.1892152, 42.4237391, 43.1982627, 44.1678127, 44.9746195,
    ])
    # Cap the stored Bonferroni values at 1 before comparing.
    want_bonf_p = np.minimum(raw_bonf_p, 1)
    want_order = [
        "minister", "reporter", "contractor", "insurance.agent", "machinist",
        "store.clerk", "conductor", "factory.owner", "mail.carrier",
        "streetcar.motorman", "carpenter", "coal.miner", "bartender",
        "bookkeeper", "soda.clerk", "chemist", "RR.engineer", "professor",
        "electrician", "gas.stn.attendant", "auto.repairman", "watchman",
        "banker", "machine.operator", "dentist", "waiter", "shoe.shiner",
        "welfare.worker", "plumber", "physician", "pilot", "engineer",
        "accountant", "lawyer", "undertaker", "barber", "store.manager",
        "truck.driver", "cook", "janitor", "policeman", "architect",
        "teacher", "taxi.driver", "author",
    ]

    want_table = np.column_stack((want_rstudent, want_unadj_p, want_bonf_p))
    got = oi.outlier_test(fitted, method="b", labels=occupations, order=True)
    np.testing.assert_almost_equal(got.values, want_table, 7)
    np.testing.assert_equal(
        got.index.tolist(),  # pylint: disable-msg=E1103
        want_order,
    )
def test_outlier_test():
    """Verify Bonferroni outlier-test output against a stored reference table.

    Covers ``oi.outlier_test`` on a fit built from ``endog``/``exog``,
    ``oi.outlier_test`` on a formula fit over a label-indexed data frame, and
    the fit's own ``outlier_test`` method with the ``order`` and ``cutoff``
    keywords.
    """
    endog, exog, labels = get_duncan_data()
    plain_results = OLS(endog, exog).fit()

    # Reference values, one entry per observation, ordered by ascending
    # unadjusted p-value.
    rstudent_ref = [
        3.1345185839, -2.3970223990, 2.0438046359,
        -1.9309187757, 1.8870465798, -1.7604905300,
        -1.7040324156, 1.6024285876, -1.4332485037,
        -1.1044851583, 1.0688582315, 1.0185271840,
        -0.9024219332, -0.9023876471, -0.8830953936,
        0.8265782334, 0.8089220547, 0.7682770197,
        0.7319491074, -0.6665962829, 0.5227352794,
        -0.5135016547, 0.5083881518, 0.4999224372,
        -0.4980818221, -0.4759717075, -0.4293565820,
        -0.4114056499, -0.3779540862, 0.3556874030,
        0.3409200462, 0.3062248646, 0.3038999429,
        -0.3030815773, -0.1873387893, 0.1738050251,
        0.1424246593, -0.1292266025, 0.1272066463,
        -0.0798902878, 0.0788467222, 0.0722556991,
        0.0505098280, 0.0233215136, 0.0007112055,
    ]
    unadj_p_ref = [
        0.003177202, 0.021170298, 0.047432955, 0.060427645,
        0.066248120, 0.085783008, 0.095943909, 0.116738318,
        0.159368890, 0.275822623, 0.291386358, 0.314400295,
        0.372104049, 0.372122040, 0.382333561, 0.413260793,
        0.423229432, 0.446725370, 0.468363101, 0.508764039,
        0.603971990, 0.610356737, 0.613905871, 0.619802317,
        0.621087703, 0.636621083, 0.669911674, 0.682917818,
        0.707414459, 0.723898263, 0.734904667, 0.760983108,
        0.762741124, 0.763360242, 0.852319039, 0.862874018,
        0.887442197, 0.897810225, 0.899398691, 0.936713197,
        0.937538115, 0.942749758, 0.959961394, 0.981506948,
        0.999435989,
    ]
    bonf_p_uncapped = np.array([
        0.1429741, 0.9526634, 2.1344830, 2.7192440,
        2.9811654, 3.8602354, 4.3174759, 5.2532243,
        7.1716001, 12.4120180, 13.1123861, 14.1480133,
        16.7446822, 16.7454918, 17.2050103, 18.5967357,
        19.0453245, 20.1026416, 21.0763395, 22.8943818,
        27.1787396, 27.4660532, 27.6257642, 27.8911043,
        27.9489466, 28.6479487, 30.1460253, 30.7313018,
        31.8336506, 32.5754218, 33.0707100, 34.2442399,
        34.3233506, 34.3512109, 38.3543568, 38.8293308,
        39.9348989, 40.4014601, 40.4729411, 42.1520939,
        42.1892152, 42.4237391, 43.1982627, 44.1678127,
        44.9746195,
    ])
    # Anything above 1 in the stored Bonferroni column is compared as 1.
    bonf_p_ref = np.minimum(bonf_p_uncapped, 1)
    labels_in_order = [
        "minister", "reporter", "contractor",
        "insurance.agent", "machinist", "store.clerk",
        "conductor", "factory.owner", "mail.carrier",
        "streetcar.motorman", "carpenter", "coal.miner",
        "bartender", "bookkeeper", "soda.clerk",
        "chemist", "RR.engineer", "professor",
        "electrician", "gas.stn.attendant", "auto.repairman",
        "watchman", "banker", "machine.operator",
        "dentist", "waiter", "shoe.shiner",
        "welfare.worker", "plumber", "physician",
        "pilot", "engineer", "accountant",
        "lawyer", "undertaker", "barber",
        "store.manager", "truck.driver", "cook",
        "janitor", "policeman", "architect",
        "teacher", "taxi.driver", "author",
    ]
    reference = np.column_stack((rstudent_ref, unadj_p_ref, bonf_p_ref))

    # Function interface with explicitly supplied labels.
    outl_plain = oi.outlier_test(plain_results, method='b', labels=labels,
                                 order=True)
    np.testing.assert_almost_equal(outl_plain.values, reference, 7)
    np.testing.assert_equal(
        outl_plain.index.tolist(),  # pylint: disable-msg=E1103
        labels_in_order,
    )

    # check `order` with pandas bug in #3971
    stacked = np.column_stack((endog, exog))
    column_names = 'y const var1 var2'.split()
    df = pd.DataFrame(stacked, columns=column_names, index=labels)
    pandas_results = OLS.from_formula('y ~ const + var1 + var2 - 0', df).fit()

    outl_ordered = oi.outlier_test(pandas_results, method='b', order=True)
    assert_almost_equal(outl_ordered.values, reference, 7)
    assert_equal(outl_ordered.index.tolist(), labels_in_order)

    # Unordered method output sorted manually must match the ordered output.
    outl_unordered = pandas_results.outlier_test(method='b')
    outl_manual = outl_unordered.sort_values(['unadj_p'], ascending=True)
    assert_almost_equal(outl_manual.values, reference, 7)
    assert_equal(outl_manual.index.tolist(), labels_in_order)
    assert_array_equal(outl_ordered.index, outl_manual.index)

    # additional keywords in method
    outl_method = pandas_results.outlier_test(method='b', order=True)
    assert_equal(outl_method.index.tolist(), labels_in_order)
    outl_cut = pandas_results.outlier_test(method='b', order=True,
                                           cutoff=0.15)
    assert_equal(outl_cut.index.tolist(), labels_in_order[:1])
def test_outlier_test():
    """Exercise the ``oi.outlier_test`` interface on inline data.

    Expected results are from R with NA -> 1. Just the interface is tested
    here because outlier_test is just a wrapper.
    """
    row_labels = [
        "accountant", "pilot", "architect",
        "author", "chemist", "minister",
        "professor", "dentist", "reporter",
        "engineer", "undertaker", "lawyer",
        "physician", "welfare.worker", "teacher",
        "conductor", "contractor", "factory.owner",
        "store.manager", "banker", "bookkeeper",
        "mail.carrier", "insurance.agent", "store.clerk",
        "carpenter", "electrician", "RR.engineer",
        "machinist", "auto.repairman", "plumber",
        "gas.stn.attendant", "coal.miner", "streetcar.motorman",
        "taxi.driver", "truck.driver", "machine.operator",
        "barber", "bartender", "shoe.shiner",
        "cook", "soda.clerk", "watchman",
        "janitor", "policeman", "waiter",
    ]
    # Duncan's prestige data from car; the first column is the constant.
    x_rows = [
        [1.0, 62.0, 86.0], [1.0, 72.0, 76.0],
        [1.0, 75.0, 92.0], [1.0, 55.0, 90.0],
        [1.0, 64.0, 86.0], [1.0, 21.0, 84.0],
        [1.0, 64.0, 93.0], [1.0, 80.0, 100.0],
        [1.0, 67.0, 87.0], [1.0, 72.0, 86.0],
        [1.0, 42.0, 74.0], [1.0, 76.0, 98.0],
        [1.0, 76.0, 97.0], [1.0, 41.0, 84.0],
        [1.0, 48.0, 91.0], [1.0, 76.0, 34.0],
        [1.0, 53.0, 45.0], [1.0, 60.0, 56.0],
        [1.0, 42.0, 44.0], [1.0, 78.0, 82.0],
        [1.0, 29.0, 72.0], [1.0, 48.0, 55.0],
        [1.0, 55.0, 71.0], [1.0, 29.0, 50.0],
        [1.0, 21.0, 23.0], [1.0, 47.0, 39.0],
        [1.0, 81.0, 28.0], [1.0, 36.0, 32.0],
        [1.0, 22.0, 22.0], [1.0, 44.0, 25.0],
        [1.0, 15.0, 29.0], [1.0, 7.0, 7.0],
        [1.0, 42.0, 26.0], [1.0, 9.0, 19.0],
        [1.0, 21.0, 15.0], [1.0, 21.0, 20.0],
        [1.0, 16.0, 26.0], [1.0, 16.0, 28.0],
        [1.0, 9.0, 17.0], [1.0, 14.0, 22.0],
        [1.0, 12.0, 30.0], [1.0, 17.0, 25.0],
        [1.0, 7.0, 20.0], [1.0, 34.0, 47.0],
        [1.0, 8.0, 32.0],
    ]
    y_values = [
        82.0, 83.0, 90.0, 76.0, 90.0, 87.0, 93.0, 90.0, 52.0,
        88.0, 57.0, 89.0, 97.0, 59.0, 73.0, 38.0, 76.0, 81.0,
        45.0, 92.0, 39.0, 34.0, 41.0, 16.0, 33.0, 53.0, 67.0,
        57.0, 26.0, 29.0, 10.0, 15.0, 19.0, 10.0, 13.0, 24.0,
        20.0, 7.0, 3.0, 16.0, 6.0, 11.0, 8.0, 41.0, 10.0,
    ]
    ols_fit = OLS(y_values, x_rows).fit()

    # Reference columns; entries are in ascending order of unadjusted p-value.
    r_rstudent = [
        3.1345185839, -2.3970223990, 2.0438046359, -1.9309187757,
        1.8870465798, -1.7604905300, -1.7040324156, 1.6024285876,
        -1.4332485037, -1.1044851583, 1.0688582315, 1.0185271840,
        -0.9024219332, -0.9023876471, -0.8830953936, 0.8265782334,
        0.8089220547, 0.7682770197, 0.7319491074, -0.6665962829,
        0.5227352794, -0.5135016547, 0.5083881518, 0.4999224372,
        -0.4980818221, -0.4759717075, -0.4293565820, -0.4114056499,
        -0.3779540862, 0.3556874030, 0.3409200462, 0.3062248646,
        0.3038999429, -0.3030815773, -0.1873387893, 0.1738050251,
        0.1424246593, -0.1292266025, 0.1272066463, -0.0798902878,
        0.0788467222, 0.0722556991, 0.0505098280, 0.0233215136,
        0.0007112055,
    ]
    r_unadj_p = [
        0.003177202, 0.021170298, 0.047432955, 0.060427645,
        0.066248120, 0.085783008, 0.095943909, 0.116738318,
        0.159368890, 0.275822623, 0.291386358, 0.314400295,
        0.372104049, 0.372122040, 0.382333561, 0.413260793,
        0.423229432, 0.446725370, 0.468363101, 0.508764039,
        0.603971990, 0.610356737, 0.613905871, 0.619802317,
        0.621087703, 0.636621083, 0.669911674, 0.682917818,
        0.707414459, 0.723898263, 0.734904667, 0.760983108,
        0.762741124, 0.763360242, 0.852319039, 0.862874018,
        0.887442197, 0.897810225, 0.899398691, 0.936713197,
        0.937538115, 0.942749758, 0.959961394, 0.981506948,
        0.999435989,
    ]
    # The stored Bonferroni column is capped at 1 before the comparison.
    r_bonf_p = np.minimum(
        np.array([
            0.1429741, 0.9526634, 2.1344830, 2.7192440, 2.9811654,
            3.8602354, 4.3174759, 5.2532243, 7.1716001, 12.4120180,
            13.1123861, 14.1480133, 16.7446822, 16.7454918, 17.2050103,
            18.5967357, 19.0453245, 20.1026416, 21.0763395, 22.8943818,
            27.1787396, 27.4660532, 27.6257642, 27.8911043, 27.9489466,
            28.6479487, 30.1460253, 30.7313018, 31.8336506, 32.5754218,
            33.0707100, 34.2442399, 34.3233506, 34.3512109, 38.3543568,
            38.8293308, 39.9348989, 40.4014601, 40.4729411, 42.1520939,
            42.1892152, 42.4237391, 43.1982627, 44.1678127, 44.9746195,
        ]),
        1,
    )
    r_order = [
        "minister", "reporter", "contractor", "insurance.agent",
        "machinist", "store.clerk", "conductor", "factory.owner",
        "mail.carrier", "streetcar.motorman", "carpenter", "coal.miner",
        "bartender", "bookkeeper", "soda.clerk", "chemist",
        "RR.engineer", "professor", "electrician", "gas.stn.attendant",
        "auto.repairman", "watchman", "banker", "machine.operator",
        "dentist", "waiter", "shoe.shiner", "welfare.worker",
        "plumber", "physician", "pilot", "engineer",
        "accountant", "lawyer", "undertaker", "barber",
        "store.manager", "truck.driver", "cook", "janitor",
        "policeman", "architect", "teacher", "taxi.driver",
        "author",
    ]

    r_table = np.column_stack((r_rstudent, r_unadj_p, r_bonf_p))
    computed = oi.outlier_test(
        ols_fit, method='b', labels=row_labels, order=True
    )
    np.testing.assert_almost_equal(computed.values, r_table, 7)
    np.testing.assert_equal(
        computed.index.tolist(),  # pylint: disable-msg=E1103
        r_order,
    )