def test_replace_missing_vals(self):
    """replace_missing_vals fills designated missing entries per strategy."""
    # Mixed-type structured array containing several flavors of "missing".
    data = np.array(
        [('a', 0, 0.0, 0.1),
         ('b', 1, 1.0, np.nan),
         ('', -999, np.nan, 0.0),
         ('d', 1, np.nan, 0.2),
         ('', -999, 2.0, np.nan)],
        dtype=[('str', 'O'), ('int', int), ('float1', float),
               ('float2', float)])

    # 'constant' with the default missing value (NaN): only the float
    # columns contain NaN, so only they change.
    expected = data.copy()
    expected['float1'] = np.array([0.0, 1.0, -1.0, -1.0, 2.0])
    expected['float2'] = np.array([0.1, -1.0, 0.0, 0.2, -1.0])
    result = replace_missing_vals(data, 'constant', constant=-1.0)
    self.assertTrue(np.array_equal(expected, result))

    # 'constant' replacing zeros with 100.
    expected = data.copy()
    expected['int'] = np.array([100, 1, -999, 1, -999])
    expected['float1'] = np.array([100, 1.0, np.nan, np.nan, 2.0])
    expected['float2'] = np.array([0.1, np.nan, 100, 0.2, np.nan])
    result = replace_missing_vals(data, 'constant', missing_val=0,
                                  constant=100)
    self.assertTrue(utils_for_tests.array_equal(expected, result))

    # 'most_frequent' replacing the -999 sentinel in the int column.
    expected = data.copy()
    expected['int'] = np.array([0, 1, 1, 1, 1])
    result = replace_missing_vals(data, 'most_frequent', missing_val=-999)
    self.assertTrue(utils_for_tests.array_equal(expected, result))

    # 'mean' replacing NaNs with the column mean of the present values.
    expected = data.copy()
    expected['float1'] = np.array([0.0, 1.0, 1.0, 1.0, 2.0])
    expected['float2'] = np.array([0.1, 0.1, 0.0, 0.2, 0.1])
    result = replace_missing_vals(data, 'mean', missing_val=np.nan)
    self.assertTrue(utils_for_tests.array_equal(expected, result))
# Verifies describe_cols for a list of lists, a homogeneous ndarray, and a
# structured array: checks both the returned summary-statistics array and
# (via uft.rerout_stdout) the table printed in verbose mode. Non-numeric
# columns in the structured array are expected to yield NaN statistics.
def test_describe_cols(self): test_list = [[1, 2],[2, 3],[3, 4],[4, 5],[5, 6],[6, 7]] test_nd = np.array(test_list) test_sa = np.array([(1, 2, 'a'), (2, 3, 'b'), (3, 4, 'c'), (4, 5, 'd'), (5, 6, 'e'), (6, 7, 'f')], dtype=[('id', int), ('val', float), ('name', 'S1')]) ctrl_list = np.array([('f0', 6, 3.5, 1.707825127659933, 1, 6), ('f1', 6, 4.5, 1.707825127659933, 2, 7)], dtype=[('Column Name', 'S2'), ('Count', int), ('Mean', float), ('Standard Dev', float), ('Minimum', int), ('Maximum', int)]) ctrl_printout = """ Column Name Count Mean Standard Dev Minimum Maximum 0 f0 6 3.5 1.70782512766 1 6 1 f1 6 4.5 1.70782512766 2 7 """.strip() with uft.rerout_stdout() as get_stdout: self.assertTrue(uft.array_equal(ctrl_list, describe_cols( test_list))) self.assertEqual(get_stdout().strip(), ctrl_printout) self.assertTrue(uft.array_equal(ctrl_list, describe_cols( test_nd, verbose=False))) ctrl_sa = np.array([('id', 6, 3.5, 1.707825127659933, 1, 6), ('val', 6, 4.5, 1.707825127659933, 2, 7), ('name', np.nan, np.nan, np.nan, np.nan, np.nan)], dtype=[('Column Name', 'S4'), ('Count', float), ('Mean', float), ('Standard Dev', float), ('Minimum', float), ('Maximum', float)]) self.assertTrue(uft.array_equal(ctrl_sa, describe_cols( test_sa, verbose=False)))
def test_array_emitter(self):
    """SUM-aggregated features from SQL over three date intervals."""
    db_file = uft.path_of_data('rg_complex_dates.db')
    emitter = array_emitter.ArrayEmitter(convert_to_unix_time=True)
    # Every feature column is summed within the queried interval.
    for feature in ('bounded', 'no_start', 'no_stop', 'unbounded'):
        emitter = emitter.set_aggregation(feature, 'SUM')
    emitter = emitter.get_rg_from_sql(self.conn_str, 'rg_complex_dates',
                                      feature_col='feature')
    first_half = emitter.set_interval(
        datetime(2010, 1, 1), datetime(2010, 6, 30)).emit_M()
    second_half = emitter.set_interval(
        datetime(2010, 7, 1), datetime(2010, 12, 31)).emit_M()
    full_year = emitter.set_interval(
        datetime(2010, 1, 1), datetime(2010, 12, 31)).emit_M()
    expected_dtype = [('id', '<i8'), ('bounded_sum', '<f8'),
                      ('no_start_sum', '<f8'), ('no_stop_sum', '<f8'),
                      ('unbounded_sum', '<f8')]
    expected_rows = (
        [(0, 1.0, 100.0, 100000.0, 1000000.0),
         (1, 0.01, 0.001, 1e-06, 1e-07),
         (2, np.nan, np.nan, np.nan, 2e-08)],
        [(0, 10.0, 1000.0, 10000.0, 1000000.0),
         (1, 0.1, 0.0001, 1e-05, 1e-07),
         (2, np.nan, np.nan, np.nan, 2e-08)],
        [(0, 11.0, 1100.0, 110000.0, 1000000.0),
         (1, 0.11, 0.0011, 1.1e-05, 1e-07),
         (2, np.nan, np.nan, np.nan, 2e-08)])
    for result, rows in zip((first_half, second_half, full_year),
                            expected_rows):
        self.assertTrue(
            uft.array_equal(result, np.array(rows, dtype=expected_dtype),
                            idx_col='id'))
def test_multiple_aggr(self):
    """Per-feature aggregation lists combined with a default aggregation."""
    db_file = uft.path_of_data('rg_students.db')
    conn_str = 'sqlite:///{}'.format(db_file)
    emitter = array_emitter.ArrayEmitter()
    emitter = emitter.get_rg_from_sql(conn_str, 'rg_students')
    # Features without an explicit aggregation get all four of these.
    emitter = emitter.set_default_aggregation(['AVG', 'MIN', 'MAX', 'COUNT'])
    emitter = emitter.set_aggregation('absences', ['MIN', 'MAX'])
    emitter = emitter.set_aggregation('graduated', 'MAX')
    emitter = emitter.set_interval(2005, 2007)
    # 'graduated' is the label, taken from a later interval.
    emitter = emitter.set_label_feature('graduated')
    emitter = emitter.set_label_interval(2009, 2009)
    result = emitter.emit_M()
    expected = np.array(
        [(0, 2.2, 2.1, 2.3, 2, 3.95, 3.9, 4.0, 2, 7.0, 8.0, 1.0),
         (1, 3.45, 3.4, 3.5, 2, np.nan, np.nan, np.nan, np.nan, 0.0, 0.0,
          0.0),
         (2, 3.4, 3.4, 3.4, 1.0, np.nan, np.nan, np.nan, np.nan, 14.0,
          96.0, np.nan)],
        dtype=[('id', '<i8'), ('math_gpa_AVG', '<f8'),
               ('math_gpa_MIN', '<f8'), ('math_gpa_MAX', '<f8'),
               ('math_gpa_COUNT', '<i8'), ('english_gpa_AVG', '<f8'),
               ('english_gpa_MIN', '<f8'), ('english_gpa_MAX', '<f8'),
               ('english_gpa_COUNT', '<f8'), ('absences_MIN', '<f8'),
               ('absences_MAX', '<f8'), ('graduated_MAX', '<f8')])
    self.assertTrue(uft.array_equal(result, expected))
def test_cast_list_of_list_to_sa(self): L = [[None, None, None], ['a', 5, None], ['ab', 'x', None]] ctrl = np.array( [('', '', ''), ('a', '5', ''), ('ab', 'x', '')], dtype=[('f0', 'S2'), ('f1', 'S1'), ('f2', 'S1')]) conv = utils.cast_list_of_list_to_sa(L) self.assertTrue(np.array_equal(conv, ctrl)) L = [[None, u'\u05dd\u05d5\u05dc\u05e9', 4.0, 7], [2, 'hello', np.nan, None], [4, None, None, 14L]] ctrl = np.array( [(-999, u'\u05dd\u05d5\u05dc\u05e9', 4.0, 7), (2, u'hello', np.nan, -999L), (4, u'', np.nan, 14L)], dtype=[('int', int), ('ucode', 'U5'), ('float', float), ('long', long)]) conv = utils.cast_list_of_list_to_sa( L, col_names=['int', 'ucode', 'float', 'long']) self.assertTrue(utils_for_tests.array_equal(ctrl, conv))
def test_from_csv(self):
    """CSV-backed risk grid: SUM aggregation over three date intervals."""
    csv_file = uft.path_of_data('rg_complex_dates.csv')
    emitter = array_emitter.ArrayEmitter()
    # Every feature column is summed within the queried interval.
    for feature in ('bounded', 'no_start', 'no_stop', 'unbounded'):
        emitter = emitter.set_aggregation(feature, 'SUM')
    emitter = emitter.get_rg_from_csv(csv_file, feature_col='feature',
                                      parse_datetimes=['start', 'stop'])
    first_half = emitter.set_interval(
        datetime(2010, 1, 1), datetime(2010, 6, 30)).emit_M()
    second_half = emitter.set_interval(
        datetime(2010, 7, 1), datetime(2010, 12, 31)).emit_M()
    full_year = emitter.set_interval(
        datetime(2010, 1, 1), datetime(2010, 12, 31)).emit_M()
    expected_dtype = [('id', '<i8'), ('bounded_SUM', '<f8'),
                      ('no_start_SUM', '<f8'), ('no_stop_SUM', '<f8'),
                      ('unbounded_SUM', '<f8')]
    expected_rows = (
        [(0, 1.0, 100.0, 100000.0, 1000000.0),
         (1, 0.01, 0.001, 1e-06, 1e-07),
         (2, np.nan, np.nan, np.nan, 2e-08)],
        [(0, 10.0, 1000.0, 10000.0, 1000000.0),
         (1, 0.1, 0.0001, 1e-05, 1e-07),
         (2, np.nan, np.nan, np.nan, 2e-08)],
        [(0, 11.0, 1100.0, 110000.0, 1000000.0),
         (1, 0.11, 0.0011, 1.1e-05, 1e-07),
         (2, np.nan, np.nan, np.nan, 2e-08)])
    for result, rows in zip((first_half, second_half, full_year),
                            expected_rows):
        self.assertTrue(
            uft.array_equal(result, np.array(rows, dtype=expected_dtype)))
def test_from_csv(self):
    """CSV-backed risk grid emits summed features for each interval."""
    csv_path = uft.path_of_data('rg_complex_dates.csv')
    ae = array_emitter.ArrayEmitter()
    for col in ('bounded', 'no_start', 'no_stop', 'unbounded'):
        ae = ae.set_aggregation(col, 'SUM')
    ae = ae.get_rg_from_csv(csv_path, feature_col='feature',
                            parse_datetimes=['start', 'stop'])
    # Three query windows: each half of 2010 plus the whole year.
    intervals = ((datetime(2010, 1, 1), datetime(2010, 6, 30)),
                 (datetime(2010, 7, 1), datetime(2010, 12, 31)),
                 (datetime(2010, 1, 1), datetime(2010, 12, 31)))
    results = [ae.set_interval(start, stop).emit_M()
               for start, stop in intervals]
    expected_dtype = [('id', '<i8'), ('bounded', '<f8'),
                      ('no_start', '<f8'), ('no_stop', '<f8'),
                      ('unbounded', '<f8')]
    expected_rows = (
        [(0, 1.0, 100.0, 100000.0, 1000000.0),
         (1, 0.01, 0.001, 1e-06, 1e-07),
         (2, np.nan, np.nan, np.nan, 2e-08)],
        [(0, 10.0, 1000.0, 10000.0, 1000000.0),
         (1, 0.1, 0.0001, 1e-05, 1e-07),
         (2, np.nan, np.nan, np.nan, 2e-08)],
        [(0, 11.0, 1100.0, 110000.0, 1000000.0),
         (1, 0.11, 0.0011, 1.1e-05, 1e-07),
         (2, np.nan, np.nan, np.nan, 2e-08)])
    for result, rows in zip(results, expected_rows):
        self.assertTrue(
            uft.array_equal(result, np.array(rows, dtype=expected_dtype)))
def test_array_emitter(self):
    """SQL-backed risk grid emits summed features for each interval."""
    db_path = uft.path_of_data('rg_complex_dates.db')
    ae = array_emitter.ArrayEmitter(convert_to_unix_time=True)
    for col in ('bounded', 'no_start', 'no_stop', 'unbounded'):
        ae = ae.set_aggregation(col, 'SUM')
    ae = ae.get_rg_from_sql(self.conn_str, 'rg_complex_dates',
                            feature_col='feature')
    # Three query windows: each half of 2010 plus the whole year.
    intervals = ((datetime(2010, 1, 1), datetime(2010, 6, 30)),
                 (datetime(2010, 7, 1), datetime(2010, 12, 31)),
                 (datetime(2010, 1, 1), datetime(2010, 12, 31)))
    results = [ae.set_interval(start, stop).emit_M()
               for start, stop in intervals]
    expected_dtype = [('id', '<i8'), ('bounded_sum', '<f8'),
                      ('no_start_sum', '<f8'), ('no_stop_sum', '<f8'),
                      ('unbounded_sum', '<f8')]
    expected_rows = (
        [(0, 1.0, 100.0, 100000.0, 1000000.0),
         (1, 0.01, 0.001, 1e-06, 1e-07),
         (2, np.nan, np.nan, np.nan, 2e-08)],
        [(0, 10.0, 1000.0, 10000.0, 1000000.0),
         (1, 0.1, 0.0001, 1e-05, 1e-07),
         (2, np.nan, np.nan, np.nan, 2e-08)],
        [(0, 11.0, 1100.0, 110000.0, 1000000.0),
         (1, 0.11, 0.0011, 1.1e-05, 1e-07),
         (2, np.nan, np.nan, np.nan, 2e-08)])
    for result, rows in zip(results, expected_rows):
        self.assertTrue(
            uft.array_equal(result, np.array(rows, dtype=expected_dtype),
                            idx_col='id'))
def test_cast_list_of_list_to_sa2(self): L = [[None, None, None], ['a', 5, None], ['ab', 'x', None]] ctrl = np.array( [('', '', ''), ('a', '5', ''), ('ab', 'x', '')], dtype=[('f0', 'S2'), ('f1', 'S1'), ('f2', 'S1')]) conv = utils.cast_list_of_list_to_sa(L) self.assertTrue(np.array_equal(conv, ctrl)) L = [[None, u'\u05dd\u05d5\u05dc\u05e9', 4.0, 7], [2, 'hello', np.nan, None], [4, None, None, 14L]] ctrl = np.array( [(-999, u'\u05dd\u05d5\u05dc\u05e9', 4.0, 7), (2, u'hello', np.nan, -999L), (4, u'', np.nan, 14L)], dtype=[('int', int), ('ucode', 'U5'), ('float', float), ('long', long)]) conv = utils.cast_list_of_list_to_sa( L, col_names=['int', 'ucode', 'float', 'long']) self.assertTrue(uft.array_equal(ctrl, conv))
def test_convert_to_sa(self):
    """convert_to_sa handles structured arrays, ndarrays, lists of lists."""
    # A structured array passes through unchanged.
    sa = np.array(
        [(1, 1.0, 'a', datetime(2015, 1, 1)),
         (2, 2.0, 'b', datetime(2016, 1, 1))],
        dtype=[('int', int), ('float', float), ('str', 'S1'),
               ('date', 'M8[s]')])
    self.assertTrue(np.array_equal(sa, utils.convert_to_sa(sa)))

    # Homogeneous ndarray, default column names f0, f1, ...
    homog = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    expected = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                        dtype=[('f0', int), ('f1', int), ('f2', int)])
    self.assertTrue(np.array_equal(expected, utils.convert_to_sa(homog)))

    # Homogeneous ndarray with caller-supplied column names.
    homog = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    expected = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                        dtype=[('i0', int), ('i1', int), ('i2', int)])
    self.assertTrue(np.array_equal(
        expected,
        utils.convert_to_sa(homog, col_names=['i0', 'i1', 'i2'])))

    # List of lists, names inferred; None becomes NaN in float columns.
    lol = [[1, 1, None], ['abc', 2, 3.4]]
    expected = np.array([('1', 1, np.nan), ('abc', 2, 3.4)],
                        dtype=[('f0', 'S3'), ('f1', int), ('f2', float)])
    self.assertTrue(utils_for_tests.array_equal(
        expected, utils.convert_to_sa(lol)))

    # List of lists with names; unparseable dates become NOT_A_TIME.
    lol = [['hello', 1.2, datetime(2012, 1, 1), None],
           [1.3, np.nan, None, '2013-01-01'],
           [1.4, 1.5, '2014-01-01', 'NO_SUCH_RECORD']]
    expected = np.array(
        [('hello', 1.2, datetime(2012, 1, 1), utils.NOT_A_TIME),
         ('1.3', np.nan, utils.NOT_A_TIME, datetime(2013, 1, 1)),
         ('1.4', 1.5, datetime(2014, 1, 1), utils.NOT_A_TIME)],
        dtype=[('i0', 'S5'), ('i1', float), ('i2', 'M8[us]'),
               ('i3', 'M8[us]')])
    result = utils.convert_to_sa(lol, col_names=['i0', 'i1', 'i2', 'i3'])
    self.assertTrue(utils_for_tests.array_equal(expected, result))
def test_convert_to_sa(self):
    """convert_to_sa on structured arrays, ndarrays, and lists of lists."""
    # Already structured: returned as-is (note the object-dtype column).
    already_sa = np.array(
        [(1, 1.0, 'a', datetime(2015, 1, 1)),
         (2, 2.0, 'b', datetime(2016, 1, 1))],
        dtype=[('int', int), ('float', float), ('str', 'O'),
               ('date', 'M8[s]')])
    self.assertTrue(np.array_equal(already_sa,
                                   utils.convert_to_sa(already_sa)))

    # Homogeneous ndarray without names -> auto-named f0..fN columns.
    grid = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    auto_named = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                          dtype=[('f0', int), ('f1', int), ('f2', int)])
    self.assertTrue(np.array_equal(auto_named, utils.convert_to_sa(grid)))

    # Homogeneous ndarray with explicit names.
    grid = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    named = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                     dtype=[('i0', int), ('i1', int), ('i2', int)])
    self.assertTrue(np.array_equal(
        named, utils.convert_to_sa(grid, col_names=['i0', 'i1', 'i2'])))

    # List of lists, no names; None in a float column becomes NaN.
    lol = [[1, 1, None], ['abc', 2, 3.4]]
    inferred = np.array([('1', 1, np.nan), ('abc', 2, 3.4)],
                        dtype=[('f0', 'S3'), ('f1', int), ('f2', float)])
    self.assertTrue(uft.array_equal(inferred, utils.convert_to_sa(lol)))

    # List of lists with names; missing/bad dates become NOT_A_TIME.
    lol = [['hello', 1.2, datetime(2012, 1, 1), None],
           [1.3, np.nan, None, '2013-01-01'],
           [1.4, 1.5, '2014-01-01', 'NO_SUCH_RECORD']]
    with_dates = np.array(
        [('hello', 1.2, datetime(2012, 1, 1), utils.NOT_A_TIME),
         ('1.3', np.nan, utils.NOT_A_TIME, datetime(2013, 1, 1)),
         ('1.4', 1.5, datetime(2014, 1, 1), utils.NOT_A_TIME)],
        dtype=[('i0', 'S5'), ('i1', float), ('i2', 'M8[us]'),
               ('i3', 'M8[us]')])
    result = utils.convert_to_sa(lol, col_names=['i0', 'i1', 'i2', 'i3'])
    self.assertTrue(uft.array_equal(with_dates, result))
def test_get_top_features(self):
    """get_top_features matches a hand-computed importance ranking."""
    M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
    M = utils.cast_np_sa_to_nd(M)
    M_train, M_test, labels_train, labels_test = train_test_split(
        M, labels)
    clf = RandomForestClassifier(random_state=0)
    clf.fit(M_train, labels_train)
    # Build the expected result directly from the classifier's own
    # importances: top 10 features in descending order.
    importances = clf.feature_importances_
    names = ['f{}'.format(i) for i in xrange(15)]
    top_ranks = np.argsort(importances)[::-1][:10]
    expected = utils.convert_to_sa(
        zip(names, importances),
        col_names=('feat_name', 'score'))[top_ranks]
    result = dsp.get_top_features(clf, M, verbose=False)
    self.assertTrue(uft.array_equal(expected, result))
    # Passing column names instead of the matrix gives the same answer.
    result = dsp.get_top_features(
        clf, col_names=['f{}'.format(i) for i in xrange(15)],
        verbose=False)
    self.assertTrue(uft.array_equal(expected, result))
def test_describe_cols(self):
    """describe_cols summary stats for list, ndarray, structured array."""
    rows = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]]
    as_nd = np.array(rows)
    as_sa = np.array(
        [(1, 2, 'a'), (2, 3, 'b'), (3, 4, 'c'), (4, 5, 'd'), (5, 6, 'e'),
         (6, 7, 'f')],
        dtype=[('id', int), ('val', float), ('name', 'S1')])
    # Both the list and the ndarray views yield the same statistics.
    expected_homog = np.array(
        [('f0', 6, 3.5, 1.707825127659933, 1, 6),
         ('f1', 6, 4.5, 1.707825127659933, 2, 7)],
        dtype=[('Column Name', 'S2'), ('Count', int), ('Mean', float),
               ('Standard Dev', float), ('Minimum', int),
               ('Maximum', int)])
    self.assertTrue(utils_for_tests.array_equal(expected_homog,
                                                describe_cols(rows)))
    self.assertTrue(utils_for_tests.array_equal(expected_homog,
                                                describe_cols(as_nd)))
    # Non-numeric columns report NaN for every statistic.
    expected_sa = np.array(
        [('id', 6, 3.5, 1.707825127659933, 1, 6),
         ('val', 6, 4.5, 1.707825127659933, 2, 7),
         ('name', np.nan, np.nan, np.nan, np.nan, np.nan)],
        dtype=[('Column Name', 'S4'), ('Count', float), ('Mean', float),
               ('Standard Dev', float), ('Minimum', float),
               ('Maximum', float)])
    self.assertTrue(utils_for_tests.array_equal(expected_sa,
                                                describe_cols(as_sa)))
def test_describe_cols(self):
    """describe_cols produces per-column summary statistics."""
    data_list = [[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]]
    data_nd = np.array(data_list)
    data_sa = np.array(
        [(1, 2, 'a'), (2, 3, 'b'), (3, 4, 'c'), (4, 5, 'd'), (5, 6, 'e'),
         (6, 7, 'f')],
        dtype=[('id', int), ('val', float), ('name', 'S1')])
    # List-of-lists and ndarray inputs share the same expected summary.
    summary_homog = np.array(
        [('f0', 6, 3.5, 1.707825127659933, 1, 6),
         ('f1', 6, 4.5, 1.707825127659933, 2, 7)],
        dtype=[('Column Name', 'S2'), ('Count', int), ('Mean', float),
               ('Standard Dev', float), ('Minimum', int),
               ('Maximum', int)])
    self.assertTrue(uft.array_equal(summary_homog,
                                    describe_cols(data_list)))
    self.assertTrue(uft.array_equal(summary_homog, describe_cols(data_nd)))
    # The string column gets NaN for every statistic.
    summary_sa = np.array(
        [('id', 6, 3.5, 1.707825127659933, 1, 6),
         ('val', 6, 4.5, 1.707825127659933, 2, 7),
         ('name', np.nan, np.nan, np.nan, np.nan, np.nan)],
        dtype=[('Column Name', 'S4'), ('Count', float), ('Mean', float),
               ('Standard Dev', float), ('Minimum', float),
               ('Maximum', float)])
    self.assertTrue(uft.array_equal(summary_sa, describe_cols(data_sa)))
def test_basic(self):
    """A single MAX aggregation over one interval of the student fixture."""
    db_file = uft.path_of_data('rg_students.db')
    conn_str = 'sqlite:///{}'.format(db_file)
    emitter = array_emitter.ArrayEmitter()
    emitter = emitter.get_rg_from_sql(conn_str, 'rg_students')
    emitter = emitter.set_aggregation('absences', 'MAX')
    emitter = emitter.set_interval(2005, 2007)
    result = emitter.emit_M()
    # Students with no english_gpa records in the interval report NaN.
    expected = np.array(
        [(0, 2.2, 3.95, 8.0),
         (1, 3.45, np.nan, 0.0),
         (2, 3.4, np.nan, 96.0)],
        dtype=[('id', '<i8'), ('math_gpa', '<f8'),
               ('english_gpa', '<f8'), ('absences', '<f8')])
    self.assertTrue(uft.array_equal(result, expected))
def test_get_top_features(self):
    """get_top_features reproduces known importances for a fixed seed."""
    M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
    M = utils.cast_np_sa_to_nd(M)
    M_train, M_test, labels_train, labels_test = train_test_split(
        M, labels)
    clf = RandomForestClassifier(random_state=0)
    clf.fit(M_train, labels_train)
    result = dsp.get_top_features(clf, M, verbose=False)
    # Expected top-10 (name, importance) pairs for this fixed random seed.
    expected = utils.convert_to_sa(
        [('f5', 0.0773838526068), ('f13', 0.0769596713039),
         ('f8', 0.0751584839431), ('f6', 0.0730815879102),
         ('f11', 0.0684456133071), ('f9', 0.0666747414603),
         ('f10', 0.0659621889608), ('f7', 0.0657988099065),
         ('f2', 0.0634000069218), ('f0', 0.0632912268319)],
        col_names=('feat_name', 'score'))
    self.assertTrue(uft.array_equal(expected, result))
    # Same answer when only column names are supplied.
    result = dsp.get_top_features(
        clf, col_names=['f{}'.format(i) for i in xrange(15)],
        verbose=False)
    self.assertTrue(uft.array_equal(expected, result))
# Verifies that table() tallies value frequencies into a structured array
# with (col_name, count) fields and, via uft.rerout_stdout, that it prints
# a readable frequency table on stdout.
def test_table(self): data = np.array(['a', 'b', 'a', 'b', 'b', 'b', 'b', 'a', 'c', 'c', 'b', 'c', 'a'], dtype='O') ctrl_sa = np.array( [('a', 4), ('b', 6), ('c', 3)], dtype=[('col_name', 'S1'), ('count', int)]) ctrl_printout = """ col_name count 0 a 4 1 b 6 2 c 3 """.strip() with uft.rerout_stdout() as get_stdout: self.assertTrue(uft.array_equal(ctrl_sa, table(data))) self.assertEqual(get_stdout().strip(), ctrl_printout)
def test_get_top_features(self):
    """comm.get_top_features reproduces known importances (fixed seed)."""
    M, labels = uft.generate_test_matrix(1000, 15, random_state=0)
    M = utils.cast_np_sa_to_nd(M)
    M_train, M_test, labels_train, labels_test = train_test_split(
        M, labels)
    clf = RandomForestClassifier(random_state=0)
    clf.fit(M_train, labels_train)
    result = comm.get_top_features(clf, M, verbose=False)
    # Top-10 (feature, importance) pairs expected for this random seed.
    expected = utils.convert_to_sa(
        [('f5', 0.0773838526068), ('f13', 0.0769596713039),
         ('f8', 0.0751584839431), ('f6', 0.0730815879102),
         ('f11', 0.0684456133071), ('f9', 0.0666747414603),
         ('f10', 0.0659621889608), ('f7', 0.0657988099065),
         ('f2', 0.0634000069218), ('f0', 0.0632912268319)],
        col_names=('feat_name', 'score'))
    self.assertTrue(uft.array_equal(expected, result))
def test_join(self):
    """utils.join mirrors pandas merge for inner/left/right/outer joins."""
    # Basic inner join: employees joined to departments.
    emps = np.array(
        [(0, 'Lisa', 2), (1, 'Bill', 1), (2, 'Fred', 2),
         (3, 'Samantha', 2), (4, 'Augustine', 1), (5, 'William', 0)],
        dtype=[('id', int), ('name', 'O'), ('dept_id', int)])
    depts = np.array(
        [(0, 'accts receivable'), (1, 'accts payable'), (2, 'shipping')],
        dtype=[('id', int), ('name', 'S16')])
    expected = pd.DataFrame(emps).merge(
        pd.DataFrame(depts), left_on='dept_id',
        right_on='id').to_records(index=False)
    result = utils.join(emps, depts, 'inner', 'dept_id', 'id')
    self.assertTrue(uft.array_equal(expected, result, idx_col='id_x'))

    # Column naming rules: overlapping names get the supplied suffixes.
    left = np.array(
        [(0, 'a', 1, 2, 3)],
        dtype=[('idx0', int), ('name', 'O'), ('a1_idx1', int),
               ('idx2', int), ('idx3', int)])
    right = np.array(
        [(0, 'b', 1, 2, 3)],
        dtype=[('idx0', int), ('name', 'O'), ('a2_idx1', int),
               ('idx2', int), ('idx3', int)])
    expected = pd.DataFrame(left).merge(
        pd.DataFrame(right),
        left_on=['idx0', 'a1_idx1', 'idx2'],
        right_on=['idx0', 'a2_idx1', 'idx2'],
        suffixes=['_left', '_right']).to_records(index=False)
    result = utils.join(
        left, right, 'inner',
        left_on=['idx0', 'a1_idx1', 'idx2'],
        right_on=['idx0', 'a2_idx1', 'idx2'],
        suffixes=['_left', '_right'])
    self.assertTrue(uft.array_equal(expected, result, idx_col='idx0'))

    # All four join types against hand-computed results; unmatched cells
    # are filled with '' for strings and -999 for ints.
    a1 = np.array(
        [(0, 'a1_0', 0), (1, 'a1_1', 1), (1, 'a1_2', 2), (2, 'a1_3', 3),
         (3, 'a1_4', 4)],
        dtype=[('key', int), ('label', 'O'), ('idx', int)])
    a2 = np.array(
        [(0, 'a2_0', 0), (1, 'a2_1', 1), (2, 'a2_2', 2), (2, 'a2_3', 3),
         (4, 'a2_4', 4)],
        dtype=[('key', int), ('label', 'O'), ('idx', int)])
    merged_dtype = [('key', int), ('label_x', 'O'), ('idx_x', int),
                    ('label_y', 'O'), ('idx_y', int)]
    inner_rows = [(0, 'a1_0', 0, 'a2_0', 0), (1, 'a1_1', 1, 'a2_1', 1),
                  (1, 'a1_2', 2, 'a2_1', 1), (2, 'a1_3', 3, 'a2_2', 2),
                  (2, 'a1_3', 3, 'a2_3', 3)]
    left_rows = inner_rows + [(3, 'a1_4', 4, '', -999)]
    right_rows = inner_rows + [(4, '', -999, 'a2_4', 4)]
    outer_rows = inner_rows + [(4, '', -999, 'a2_4', 4),
                               (3, 'a1_4', 4, '', -999)]
    cases = zip(('inner', 'left', 'right', 'outer'),
                (inner_rows, left_rows, right_rows, outer_rows))
    for how, rows in cases:
        result = utils.join(a1, a2, how, left_on='key', right_on='key')
        expected = np.array(rows, dtype=merged_dtype)
        self.assertTrue(uft.array_equal(expected, result))