def test_sort_values(self): frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list('ABC')) # by column (axis=0) sorted_df = frame.sort_values(by='A') indexer = frame['A'].argsort().values expected = frame.ix[frame.index[indexer]] assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by='A', ascending=False) indexer = indexer[::-1] expected = frame.ix[frame.index[indexer]] assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by='A', ascending=False) assert_frame_equal(sorted_df, expected) # GH4839 sorted_df = frame.sort_values(by=['A'], ascending=[False]) assert_frame_equal(sorted_df, expected) # multiple bys sorted_df = frame.sort_values(by=['B', 'C']) expected = frame.loc[[2, 1, 3]] assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=['B', 'C'], ascending=False) assert_frame_equal(sorted_df, expected[::-1]) sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False]) assert_frame_equal(sorted_df, expected) self.assertRaises(ValueError, lambda: frame.sort_values( by=['A', 'B'], axis=2, inplace=True)) # by row (axis=1): GH 10806 sorted_df = frame.sort_values(by=3, axis=1) expected = frame assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=3, axis=1, ascending=False) expected = frame.reindex(columns=['C', 'B', 'A']) assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=[1, 2], axis='columns') expected = frame.reindex(columns=['B', 'A', 'C']) assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False]) assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False) expected = frame.reindex(columns=['C', 'B', 'A']) assert_frame_equal(sorted_df, expected) msg = r'Length of ascending \(5\) != length of by \(2\)' with assertRaisesRegexp(ValueError, msg): frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5)
def prepareBreakagebreakageSummary(breakageData, stlSalesSamePeriod, kcSalesSamePeriod, reportYear, lastYear):
    '''
    Takes in clean data and gets it ready for consumption.

    Parameters
    ----------
    breakageData : DataFrame
        Clean breakage rows with 'Warehouse', 'ReasonCode', 'Year',
        'Breakage|Dollars' and 'Breakage|Cases' columns.
    stlSalesSamePeriod, kcSalesSamePeriod : float
        Same-period sales totals used as the 'Breakage|% Sales' denominators
        for Saint Louis and Kansas City respectively.
    reportYear, lastYear : int
        The current report year and the prior comparison year; must both
        appear in breakageData['Year'].

    Returns
    -------
    DataFrame
        Indexed by (Warehouse, ReasonCode), with dollar/case totals per
        year, year-over-year % changes, and breakage as a share of sales.
    '''
    aggFuncs = {'Breakage|Dollars': np.sum, 'Breakage|Cases': np.sum}
    groupCols = ['Warehouse', 'ReasonCode', 'Year']
    breakageSummary = DataFrame(
        breakageData.groupby(groupCols).agg(aggFuncs).reset_index(drop=False))
    # Pivot so each year becomes its own column per measure.
    breakageSummary = pd.DataFrame(breakageSummary.pivot_table(
        values=['Breakage|Cases', 'Breakage|Dollars'],
        index=['Warehouse', 'ReasonCode'],
        columns=['Year']))
    # Flatten the (measure, year) column MultiIndex to 'measure|year'.
    breakageSummary.columns = ['%s%s' % (a, '|%s' % b if b else '')
                               for a, b in breakageSummary.columns]
    breakageSummary.sort_index(inplace=True, ascending=False)

    # Breakage dollars as a share of same-period sales, per warehouse.
    breakageSummary['Breakage|% Sales'] = breakageSummary.index.get_level_values(0)
    breakageSummary['Breakage|% Sales'] = breakageSummary['Breakage|% Sales'].map(
        {'Kansas City': kcSalesSamePeriod, 'Saint Louis': stlSalesSamePeriod})
    # BUG FIX: the numerator column was hard-coded to 'Breakage|Dollars|2016';
    # derive it from reportYear so any reporting period works.
    breakageSummary['Breakage|% Sales'] = np.divide(
        breakageSummary['Breakage|Dollars|' + str(reportYear)],
        breakageSummary['Breakage|% Sales'])

    def yoy_delta(now, then):
        # Year-over-year relative change: (now - then) / then.
        return np.divide(np.subtract(now, then), then)

    breakageSummary['Breakage|Dollars|% Change'] = round(
        yoy_delta(breakageSummary['Breakage|Dollars|' + str(reportYear)],
                  breakageSummary['Breakage|Dollars|' + str(lastYear)]), 4)
    breakageSummary['Breakage|Cases|% Change'] = round(
        yoy_delta(breakageSummary['Breakage|Cases|' + str(reportYear)],
                  breakageSummary['Breakage|Cases|' + str(lastYear)]), 4)

    # Fixed presentation order for the columns...
    breakageSummary = breakageSummary.reindex(columns=[
        'Breakage|Dollars|' + str(lastYear),
        'Breakage|Dollars|' + str(reportYear),
        'Breakage|Dollars|% Change',
        'Breakage|% Sales',
        'Breakage|Cases|' + str(lastYear),
        'Breakage|Cases|' + str(reportYear),
        'Breakage|Cases|% Change'])
    # ...and for the reason codes (rows).
    breakageSummary = breakageSummary.reindex(
        index=['Warehouse Breakage', 'Cross-Dock Breakage', 'Driver Breakage',
               'Supplier Breakage', 'Sales Breakage & Unsaleables'],
        level='ReasonCode')
    return breakageSummary
def test_na_values_keep_default(self):
    # Verify how user-supplied na_values interact with the default NA
    # sentinels and the keep_default_na flag in read_csv.
    data = """\
One,Two,Three
a,1,one
b,2,two
,3,three
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    # Defaults: '', 'nan' etc. all parse as NaN.
    df = self.read_csv(StringIO(data))
    xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
                    'Two': [1, 2, 3, 4, 5, 6, 7],
                    'Three': ['one', 'two', 'three', np.nan, 'five',
                              np.nan, 'seven']})
    tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

    # keep_default_na=False with empty per-column lists: nothing is NA,
    # so '' and 'nan' come through as literal strings.
    df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []},
                       keep_default_na=False)
    xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
                    'Two': [1, 2, 3, 4, 5, 6, 7],
                    'Three': ['one', 'two', 'three', 'nan', 'five',
                              '', 'seven']})
    tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

    # Only the explicit value 'a' is treated as NA.
    df = self.read_csv(
        StringIO(data), na_values=['a'], keep_default_na=False)
    xp = DataFrame({'One': [np.nan, 'b', '', 'd', 'e', 'nan', 'g'],
                    'Two': [1, 2, 3, 4, 5, 6, 7],
                    'Three': ['one', 'two', 'three', 'nan', 'five',
                              '', 'seven']})
    tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

    # Empty per-column lists but keep_default_na=True (default): the
    # standard sentinels still apply.
    df = self.read_csv(StringIO(data), na_values={'One': [], 'Three': []})
    xp = DataFrame({'One': ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'],
                    'Two': [1, 2, 3, 4, 5, 6, 7],
                    'Three': ['one', 'two', 'three', np.nan, 'five',
                              np.nan, 'seven']})
    tm.assert_frame_equal(xp.reindex(columns=df.columns), df)

    # see gh-4318: passing na_values=None and
    # keep_default_na=False yields 'None' as a na_value
    data = """\
One,Two,Three
a,1,None
b,2,two
,3,None
d,4,nan
e,5,five
nan,6,
g,7,seven
"""
    df = self.read_csv(
        StringIO(data), keep_default_na=False)
    xp = DataFrame({'One': ['a', 'b', '', 'd', 'e', 'nan', 'g'],
                    'Two': [1, 2, 3, 4, 5, 6, 7],
                    'Three': ['None', 'two', 'None', 'nan', 'five',
                              '', 'seven']})
    tm.assert_frame_equal(xp.reindex(columns=df.columns), df)
class Reindex(object):
    # asv-style benchmark: DataFrame/Series reindexing against dates,
    # columns, and one level of a MultiIndex.

    def setup(self):
        minute_index = date_range(start='1/1/1970', periods=10000,
                                  freq='1min')
        self.df = DataFrame(np.random.rand(10000, 10),
                            index=minute_index, columns=range(10))
        self.df['foo'] = 'bar'
        # Every other timestamp becomes the reindex target.
        self.rng_subset = Index(minute_index[::2])

        self.df2 = DataFrame(index=range(10000),
                             data=np.random.rand(10000, 30),
                             columns=range(30))

        n_outer, n_inner = 5000, 200
        outer = tm.makeStringIndex(n_outer).values.repeat(n_inner)
        inner = np.tile(tm.makeStringIndex(n_inner).values, n_outer)
        self.s = Series(np.random.randn(n_outer * n_inner),
                        index=MultiIndex.from_arrays([outer, inner]))
        self.s_subset = self.s[::2]

    def time_reindex_dates(self):
        self.df.reindex(self.rng_subset)

    def time_reindex_columns(self):
        self.df2.reindex(columns=self.df.columns[1:5])

    def time_reindex_multiindex(self):
        self.s.reindex(self.s_subset.index)
def test_reindex_api_equivalence(self):
    # https://github.com/pandas-dev/pandas/issues/12392
    # The labels/axis signature must behave exactly like index/columns,
    # and the deprecated two-positional form must warn.
    df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]],
                   index=['a', 'b', 'c'],
                   columns=['d', 'e', 'f'])

    by_rows = df.reindex(['b', 'a'])
    for candidate in (df.reindex(index=['b', 'a']),
                      df.reindex(labels=['b', 'a']),
                      df.reindex(labels=['b', 'a'], axis=0),
                      df.reindex(['b', 'a'], axis=0)):
        tm.assert_frame_equal(by_rows, candidate)

    by_cols = df.reindex(columns=['e', 'd'])
    for candidate in (df.reindex(['e', 'd'], axis=1),
                      df.reindex(labels=['e', 'd'], axis=1)):
        tm.assert_frame_equal(by_cols, candidate)

    # Two positional indexers are deprecated and must raise FutureWarning.
    with tm.assert_produces_warning(FutureWarning) as m:
        both = df.reindex(['b', 'a'], ['e', 'd'])
    assert 'reindex' in str(m[0].message)
    for candidate in (df.reindex(columns=['e', 'd'], index=['b', 'a']),
                      df.reindex(labels=['b', 'a'], axis=0)
                        .reindex(labels=['e', 'd'], axis=1)):
        tm.assert_frame_equal(both, candidate)
def test_join_multiindex(self):
    # An outer join of two MultiIndexed frames must union the indexes
    # and preserve the level names, whichever level they are sorted on.
    left_index = MultiIndex.from_arrays(
        [['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]],
        names=['first', 'second'])
    right_index = MultiIndex.from_arrays(
        [['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]],
        names=['first', 'second'])

    df1 = DataFrame(data=np.random.randn(6), index=left_index,
                    columns=['var X'])
    df2 = DataFrame(data=np.random.randn(6), index=right_index,
                    columns=['var Y'])

    def build_expected(a, b):
        # Reference result: reindex both frames onto the label union.
        union = Index(left_index.values).union(Index(right_index.values))
        out = a.reindex(union).join(b.reindex(union))
        out.index.names = left_index.names
        return out

    df1 = df1.sort_index(level=0)
    df2 = df2.sort_index(level=0)
    joined = df1.join(df2, how='outer')
    assert_frame_equal(joined, build_expected(df1, df2))
    assert joined.index.names == left_index.names

    df1 = df1.sort_index(level=1)
    df2 = df2.sort_index(level=1)
    joined = df1.join(df2, how='outer').sort_index(level=0)
    assert_frame_equal(joined, build_expected(df1, df2))
    assert joined.index.names == left_index.names
def strategy_statistics(strategy_name):
    # Print a per-day back-test summary table for one strategy.
    # NOTE: Python 2 code (print statements); QR/SDT are MongoEngine-style
    # document classes and back_test_success is defined elsewhere.
    all_qr = QR.objects(strategy_name=strategy_name)
    if not all_qr:
        print "Wrong Strategy Name!"
        return
    # All distinct trading dates for "300"-prefixed stock numbers
    # (presumably the ChiNext board -- TODO confirm).
    trading_date = SDT.objects(stock_number__startswith="300").distinct("date")
    trading_date.sort()
    bt_result = {}
    for d in trading_date:
        bt_result[str(d.date())] = back_test_success(strategy_name, d)
    frame = DataFrame(bt_result)
    # Temporarily widen pandas' console output so the table fits.
    pd.set_option("display.width", 200)
    pd.set_option("display.max_rows", 400)
    # Fix the row order before transposing so each date is one row.
    print frame.reindex(
        [
            "count",
            "one_back_test",
            "one_yield_expectation",
            "three_back_test",
            "three_yield_expectation",
            "five_back_test",
            "five_yield_expectation",
        ]
    ).T
    # Restore the default display options.
    pd.set_option("display.width", None)
    pd.set_option("display.max_rows", None)
def pd_dataframe5():
    # Demo of DataFrame.reindex on rows and columns (Python 2 code).
    frame = DataFrame(np.arange(9).reshape((3, 3)),
                      index=['a', 'c', 'd'],
                      columns=['Ohio', 'Texas', 'California'])
    print frame
    # Row reindex: the missing label 'b' becomes a NaN row.
    frame2 = frame.reindex(['a', 'b', 'c', 'd'])
    print frame2
    # Column reindex: 'Utah' is new, so it becomes a NaN column.
    stats = ['Texas', 'Utah', 'California']
    print frame.reindex(columns=stats)
    # Both axes at once; method='ffill' forward-fills along the index.
    frame = frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill',
                          columns=stats)
    print frame
    # NOTE(review): .ix is deprecated/removed in modern pandas.
    print frame.ix[['a', 'b', 'c', 'd'], stats]
def test_reindex_boolean(self):
    # Reindexing a boolean frame introduces NaN holes, which forces an
    # upcast to object dtype.
    bool_frame = DataFrame(np.ones((10, 2), dtype=bool),
                           index=np.arange(0, 20, 2),
                           columns=[0, 2])

    by_rows = bool_frame.reindex(np.arange(10))
    self.assertEqual(by_rows.values.dtype, np.object_)
    self.assertTrue(isnull(by_rows[0][1]))

    by_cols = bool_frame.reindex(columns=lrange(3))
    self.assertEqual(by_cols.values.dtype, np.object_)
    self.assertTrue(isnull(by_cols[1]).all())
def test_reindex_api_equivalence(self): # equivalence of the labels/axis and index/columns API's df = DataFrame([[1, 2, 3], [3, 4, 5], [5, 6, 7]], index=['a', 'b', 'c'], columns=['d', 'e', 'f']) res1 = df.reindex(['b', 'a']) res2 = df.reindex(index=['b', 'a']) res3 = df.reindex(labels=['b', 'a']) res4 = df.reindex(labels=['b', 'a'], axis=0) res5 = df.reindex(['b', 'a'], axis=0) for res in [res2, res3, res4, res5]: tm.assert_frame_equal(res1, res) res1 = df.reindex(columns=['e', 'd']) res2 = df.reindex(['e', 'd'], axis=1) res3 = df.reindex(labels=['e', 'd'], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) res1 = df.reindex(index=['b', 'a'], columns=['e', 'd']) res2 = df.reindex(columns=['e', 'd'], index=['b', 'a']) res3 = df.reindex(labels=['b', 'a'], axis=0).reindex(labels=['e', 'd'], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res)
def test_reindex_boolean(self):
    # NaN insertion during reindex upcasts a boolean frame to object.
    src = DataFrame(np.ones((10, 2), dtype=bool),
                    index=np.arange(0, 20, 2),
                    columns=[0, 2])

    row_result = src.reindex(np.arange(10))
    assert row_result.values.dtype == np.object_
    assert isna(row_result[0][1])

    col_result = src.reindex(columns=lrange(3))
    assert col_result.values.dtype == np.object_
    assert isna(col_result[1]).all()
class Reindex(object):
    # asv benchmark: DataFrame.reindex along each axis, both axes at
    # once, and with dtype upcasting from mixed-dtype columns.
    goal_time = 0.2

    def setup(self):
        N = 10**3
        self.df = DataFrame(np.random.randn(N * 10, N))
        # Target labels overlapping the middle of df's 10*N-row index.
        self.idx = np.arange(4 * N, 7 * N)
        # df2: each of the N columns gets one of four dtypes picked at
        # random (bool / int16 / int32 / int64) so that reindexing past
        # the index forces NaN insertion and dtype upcasts.
        self.df2 = DataFrame(
            {c: {0: np.random.randint(0, 2, N).astype(np.bool_),
                 1: np.random.randint(0, N, N).astype(np.int16),
                 2: np.random.randint(0, N, N).astype(np.int32),
                 3: np.random.randint(0, N, N).astype(np.int64)}
             [np.random.randint(0, 4)] for c in range(N)})

    def time_reindex_axis0(self):
        self.df.reindex(self.idx)

    def time_reindex_axis1(self):
        self.df.reindex(columns=self.idx)

    def time_reindex_both_axes(self):
        self.df.reindex(index=self.idx, columns=self.idx)

    def time_reindex_both_axes_ix(self):
        # NOTE(review): .ix is deprecated/removed in modern pandas;
        # kept for historical comparison in this benchmark.
        self.df.ix[self.idx, self.idx]

    def time_reindex_upcast(self):
        # Permutation of 0..1199 against a 1000-row index: the extra 200
        # labels introduce NaNs and trigger the upcast path.
        self.df2.reindex(np.random.permutation(range(1200)))
def test_reindex_multi(self):
    # Reindexing both axes in one call must match two sequential
    # single-axis reindexes.
    def check(df, rows, cols):
        combined = df.reindex(rows, cols)
        sequential = df.reindex(rows).reindex(columns=cols)
        assert_frame_equal(combined, sequential)

    check(DataFrame(np.random.randn(3, 3)), lrange(4), lrange(4))
    check(DataFrame(np.random.randint(0, 10, (3, 3))), lrange(4), lrange(4))
    check(DataFrame(np.random.randint(0, 10, (3, 3))), lrange(2), lrange(2))

    # complex dtype, keyword form
    df = DataFrame(np.random.randn(5, 3) + 1j, columns=['a', 'b', 'c'])
    result = df.reindex(index=[0, 1], columns=['a', 'b'])
    expected = df.reindex([0, 1]).reindex(columns=['a', 'b'])
    assert_frame_equal(result, expected)
def test_reindex_axes(self):
    # GH 3317: reindexing by both axes at once must keep the freq of
    # the resulting index, just like reindexing one axis at a time.
    days = [datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)]
    df = DataFrame(np.ones((3, 3)), index=days, columns=["a", "b", "c"])

    time_freq = date_range("2012-01-01", "2012-01-03", freq="d")
    some_cols = ["a", "b"]

    index_only = df.reindex(index=time_freq).index.freq
    both_at_once = df.reindex(index=time_freq, columns=some_cols).index.freq
    one_then_other = (df.reindex(index=time_freq)
                        .reindex(columns=some_cols).index.freq)

    self.assertEqual(index_only, both_at_once)
    self.assertEqual(index_only, one_then_other)
def test_reindex_name_remains(self):
    # The name of a Series/Index used as the reindex target must be
    # carried over to the resulting axis.
    values = Series(random.rand(10))

    df = DataFrame(values, index=np.arange(len(values)))
    named_indexer = Series(np.arange(10), name='iname')
    df = df.reindex(named_indexer)
    self.assertEqual(df.index.name, 'iname')

    df = df.reindex(Index(np.arange(10), name='tmpname'))
    self.assertEqual(df.index.name, 'tmpname')

    values = Series(random.rand(10))
    df = DataFrame(values.T, index=np.arange(len(values)))
    named_indexer = Series(np.arange(10), name='iname')
    df = df.reindex(columns=named_indexer)
    self.assertEqual(df.columns.name, 'iname')
def test_reindex_with_nans(self): df = DataFrame([[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]], columns=['a', 'b'], index=[100.0, 101.0, np.nan, 102.0, 103.0]) result = df.reindex(index=[101.0, 102.0, 103.0]) expected = df.iloc[[1, 3, 4]] assert_frame_equal(result, expected) result = df.reindex(index=[103.0]) expected = df.iloc[[4]] assert_frame_equal(result, expected) result = df.reindex(index=[101.0]) expected = df.iloc[[1]] assert_frame_equal(result, expected)
def test_reindex_axes(self):
    # GH 3317: reindexing by both axes must not lose the index freq;
    # all three routes below must produce the same freq.
    df = DataFrame(np.ones((3, 3)),
                   index=[datetime(2012, 1, 1),
                          datetime(2012, 1, 2),
                          datetime(2012, 1, 3)],
                   columns=['a', 'b', 'c'])
    target_index = date_range('2012-01-01', '2012-01-03', freq='d')
    target_cols = ['a', 'b']

    freqs = [df.reindex(index=target_index).index.freq,
             df.reindex(index=target_index, columns=target_cols).index.freq,
             df.reindex(index=target_index)
               .reindex(columns=target_cols).index.freq]

    self.assertEqual(freqs[0], freqs[1])
    self.assertEqual(freqs[0], freqs[2])
def test_dti_set_index_reindex(self):
    # GH 6631: set_index then reindex with tz-aware DatetimeIndexes in
    # different time zones must produce the requested index.
    df = DataFrame(np.random.random(6))
    idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern')
    idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo')

    df = df.set_index(idx1)
    tm.assert_index_equal(df.index, idx1)
    df = df.reindex(idx2)
    tm.assert_index_equal(df.index, idx2)

    # 11314
    # with tz
    index = date_range(datetime(2015, 10, 1),
                       datetime(2015, 10, 1, 23),
                       freq='H', tz='US/Eastern')
    df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index)
    new_index = date_range(datetime(2015, 10, 2),
                           datetime(2015, 10, 2, 23),
                           freq='H', tz='US/Eastern')

    # TODO: unused?
    result = df.set_index(new_index)  # noqa
    # set_index must not disturb the freq of the target index.
    assert new_index.freq == index.freq
def pickle_from_db(event_list, fname, verbose=False):
    # For each event, pull up to 50 tweets from MongoDB (client/insert_db
    # are module-level), build a DataFrame, shuffle its rows and pickle
    # it to dicts/<event>_<fname>.  NOTE: Python 2 code (print statements).
    for event in event_list:
        result = DataFrame({"text": [], "event": [], "features": [],
                            "unique_id": [], "raw_text": []})
        count = 0
        if verbose:
            print "processing data from %s" % (event)
        examples = client[insert_db][event].find()
        for tweet in examples:
            if verbose and count % 1000 == 0 and count != 0:
                print "processed %s tweets" % count
            # Skip tweets with empty text.
            if tweet["text"]:
                result = result.append(
                    DataFrame(
                        {
                            "text": tweet["text"],
                            "event": event,
                            # Features are serialised to JSON for storage.
                            "features": json.dumps(tweet["features"]),
                            "unique_id": tweet["unique_id"],
                            "raw_text": tweet["raw_text"],
                        },
                        index=[count],
                    )
                )
                count += 1
            # Cap each event at 50 tweets.
            if count == 50:
                break
        # Shuffle the rows before pickling.
        result = result.reindex(numpy.random.permutation(result.index))
        fpath = os.path.join(os.path.dirname(__file__), os.pardir,
                             "dicts/") + event + "_" + fname
        f = open(fpath, "w")
        pickle.dump(result, f)
        f.close()
        if verbose:
            print result
            print "dumped %s tweets" % len(result)
def test_include_na(self, sparse, dtype):
    # get_dummies NaN handling: NaN is dropped by default and gets its
    # own column when dummy_na=True.
    if sparse:
        pytest.xfail(reason='nan in index is problematic (GH 16894)')

    s = ['a', 'b', np.nan]
    res = get_dummies(s, sparse=sparse, dtype=dtype)
    exp = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0]},
                    dtype=self.effective_dtype(dtype))
    assert_frame_equal(res, exp)

    # Sparse dataframes do not allow nan labelled columns, see #GH8822
    res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
    exp_na = DataFrame({nan: [0, 0, 1],
                        'a': [1, 0, 0],
                        'b': [0, 1, 0]},
                       dtype=self.effective_dtype(dtype))
    # Reorder so the NaN column comes last, matching get_dummies output.
    exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
    # hack (NaN handling in assert_index_equal)
    exp_na.columns = res_na.columns
    assert_frame_equal(res_na, exp_na)

    # All-NaN input with dummy_na=True yields a single NaN column of 1s.
    res_just_na = get_dummies([nan], dummy_na=True,
                              sparse=sparse, dtype=dtype)
    exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                            dtype=self.effective_dtype(dtype))
    tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
def test_preserve_categorical_dtype():
    # GH13743, GH13854: groupby must keep the categorical dtype and its
    # ordering intact in the result.
    df = DataFrame({'A': [1, 2, 1, 1, 2],
                    'B': [10, 16, 22, 28, 34],
                    'C1': Categorical(list("abaab"),
                                      categories=list("bac"),
                                      ordered=False),
                    'C2': Categorical(list("abaab"),
                                      categories=list("bac"),
                                      ordered=True)})
    # single grouper
    exp_full = DataFrame({'A': [2.0, 1.0, np.nan],
                          'B': [25.0, 20.0, np.nan],
                          'C1': Categorical(list("bac"),
                                            categories=list("bac"),
                                            ordered=False),
                          'C2': Categorical(list("bac"),
                                            categories=list("bac"),
                                            ordered=True)})
    for grouper in ('C1', 'C2'):
        no_index = df.groupby(by=grouper, as_index=False,
                              observed=False).mean()
        with_index = df.groupby(
            by=grouper, as_index=True, observed=False).mean().reset_index()
        expected = exp_full.reindex(columns=no_index.columns)
        tm.assert_frame_equal(no_index, expected)
        tm.assert_frame_equal(with_index, expected)
def plot_scores(scores, title, x_label, classifier_names):
    """
    Make a barplot of the scores of some performance measure.

    Parameters
    ----------
    scores : dict
        Where the keys are the classifier names and the values are the
        scores.
    title : str
        Title of the plot.
    x_label : str
        Label for the x-axis.
    classifier_names : array
        List of the names of the classifiers, the order of which will be
        used to order the bars.
    """
    scores = DataFrame(scores, index=[x_label])
    # Order the bars to match the supplied classifier ordering.
    scores = scores.reindex(columns=classifier_names)

    # PEP 8 (E731): use a def rather than a lambda bound to a name.
    def format_as_percent_plot(x, pos):
        return "{:.0f}%".format(x * 100)

    fig, ax = plt.subplots(figsize=(9, 5))
    scores.plot(ax=ax, kind="bar", title=title, fontsize=12)
    ax.legend(bbox_to_anchor=(1.5, 0.6))
    ax.set_xticklabels([], rotation=0)
    # Render the y-axis as percentages.
    ax.get_yaxis().set_major_formatter(FuncFormatter(format_as_percent_plot))
    plt.show()
def main():
    # Demo of Series/DataFrame construction and reindexing (Python 2).
    # A Series can be seen as a fixed-length, ordered dict.
    s1 = Series([1, 2, 3.0, 'abc'])
    print s1
    print
    s2 = Series(data=[1, 3, 5, 7], index=['a', 'b', 'x', 'y'])
    print s2
    print s2.index
    print s2.values
    s2.name = 'a_series'
    s2.index.name = 'the_index'
    print s2
    ser = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
    # reindex: label 'e' is new, so it becomes NaN unless filled.
    a = ['a', 'b', 'c', 'd', 'e']
    ser_1 = ser.reindex(a)
    print ser_1
    ser_2 = ser.reindex(a, fill_value=0)
    print ser_2
    print
    # A DataFrame is a tabular structure holding an ordered set of
    # columns, each of which may have a different dtype (unlike an
    # ndarray, which has a single dtype).  It can essentially be seen as
    # a collection of Series sharing one index.
    data = {'state': ['Ohino', 'Ohino', 'Ohino', 'Nevada', 'Nevada'],
            'year': [2000, 2001, 2002, 2001, 2002],
            'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
    df = DataFrame(data)
    print df
    df = DataFrame(data, index=['one', 'two', 'three', 'four', 'five'],
                   columns=['year', 'state', 'pop', 'debt'])
    print df
    print df.index
    print df.columns
    print type(df['debt'])
    state = ['Texas', 'Utha', 'California']
    # Column reindex with forward fill.
    df1 = df.reindex(columns=state, method='ffill')
    print df1
    print
def test_reindex_dups(self):
    # GH4746: reindexing over a duplicated index must raise with a
    # clear error message; direct index assignment is still allowed.
    values = np.random.randn(10)
    df = DataFrame(values, index=[1, 2, 3, 4, 5, 1, 2, 3, 4, 5])

    # Assigning a fresh index directly is fine.
    relabeled = df.copy()
    relabeled.index = list(range(len(df)))
    assert_frame_equal(relabeled,
                       DataFrame(values, index=list(range(len(df)))))

    # reindex, however, cannot disambiguate duplicate labels.
    msg = "cannot reindex from a duplicate axis"
    with pytest.raises(ValueError, match=msg):
        df.reindex(index=list(range(len(df))))
def readDatasetIntoDataFrame():
    """Read SpamHamDataset.txt into a shuffled DataFrame.

    Each line of the file is '<label> <message...>' where the label is
    'spam' or 'ham'.  Returns a DataFrame with a numeric 'label' column
    (1 = spam, 0 = ham) and the message text in 'text', with the rows
    randomly permuted.
    """
    # New DataFrame with two columns
    df = DataFrame(columns=('label', 'text'))
    count = 0
    # FIX: use a context manager so the file handle is always closed
    # (the original opened the file and never closed it).
    with open("SpamHamDataset.txt", "r") as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                continue  # FIX: skip blank lines instead of crashing
            # The first word of each row is the label.
            sig = 1 if tokens[0] == 'spam' else 0
            # Message body: all remaining tokens, each followed by a
            # space (matches the original concatenation exactly).
            text = "".join(token + " " for token in tokens[1:])
            df.loc[count] = [sig, text]
            count += 1
    # Shuffle the rows so downstream train/test splits are unbiased.
    df = df.reindex(random.permutation(df.index))
    return df
def viz_dist_mat(df, new_index, show_img=True):
    '''
    Re-order a triangular data frame.

    Symmetrises the triangular matrix in `df`, reorders both axes to
    `new_index`, then keeps only the cells that were populated in the
    original frame.  Optionally displays the result as an image.
    '''
    from pandas import DataFrame

    # Mirror the filled triangle so the matrix is symmetric.
    full_matrix = df.values + df.values.T
    symmetric = DataFrame(full_matrix, index=df.index, columns=df.columns)

    # Apply the requested ordering to both axes.
    reordered = symmetric.reindex(index=new_index, columns=new_index)

    # Restore only the positions populated in the original triangle.
    mask = df.values != 0.0
    result = DataFrame(reordered.values * mask,
                       index=new_index, columns=new_index)

    if show_img:
        import matplotlib.pyplot as p
        p.imshow(result.values, interpolation='nearest', cmap='binary')
        cbar = p.colorbar()
        cbar.set_label('Distance', fontsize=20)
        p.show()

    return result
def test_include_na(self, sparse, dtype):
    # get_dummies NaN handling, including the sparse representation.
    s = ['a', 'b', np.nan]
    res = get_dummies(s, sparse=sparse, dtype=dtype)
    exp = DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0]},
                    dtype=self.effective_dtype(dtype))
    if sparse:
        exp = exp.apply(pd.SparseArray, fill_value=0.0)
    assert_frame_equal(res, exp)

    # Sparse dataframes do not allow nan labelled columns, see #GH8822
    res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
    exp_na = DataFrame({nan: [0, 0, 1],
                        'a': [1, 0, 0],
                        'b': [0, 1, 0]},
                       dtype=self.effective_dtype(dtype))
    # Reorder so the NaN column comes last, matching get_dummies output.
    exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
    # hack (NaN handling in assert_index_equal)
    exp_na.columns = res_na.columns
    if sparse:
        exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0)
    assert_frame_equal(res_na, exp_na)

    # All-NaN input with dummy_na=True yields a single NaN column of 1s.
    res_just_na = get_dummies([nan], dummy_na=True,
                              sparse=sparse, dtype=dtype)
    exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
                            dtype=self.effective_dtype(dtype))
    tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
def test_unexpected_keyword(self):
    # GH8597: misspelled keyword arguments (in_place, or inplace where
    # it is not accepted) must raise TypeError.
    df = DataFrame(np.random.randn(5, 2), columns=['jim', 'joe'])
    cat = pd.Categorical([0, 0, 2, 2, 3, np.nan])
    series = df['joe'].copy()
    series[2] = np.nan

    with pytest.raises(TypeError, match='unexpected keyword'):
        df.drop('joe', axis=1, in_place=True)

    with pytest.raises(TypeError, match='unexpected keyword'):
        df.reindex([1, 0], inplace=True)

    with pytest.raises(TypeError, match='unexpected keyword'):
        cat.fillna(0, inplace=True)

    with pytest.raises(TypeError, match='unexpected keyword'):
        series.fillna(0, in_place=True)
class LevelAlign(object):
    # asv benchmark: align/reindex a small frame against one level of a
    # large three-level MultiIndex.

    def setup(self):
        outer = np.arange(10).repeat(10000)
        middle = np.tile(np.arange(100).repeat(100), 10)
        inner = np.tile(np.tile(np.arange(100), 100), 10)
        self.index = MultiIndex(
            levels=[np.arange(10), np.arange(100), np.arange(100)],
            codes=[outer, middle, inner])
        self.df = DataFrame(np.random.randn(len(self.index), 4),
                            index=self.index)
        # A frame indexed by just the middle level's labels.
        self.df_level = DataFrame(np.random.randn(100, 4),
                                  index=self.index.levels[1])

    def time_align_level(self):
        self.df.align(self.df_level, level=1, copy=False)

    def time_reindex_level(self):
        self.df_level.reindex(self.index, level=1)
def testWithXEffects(self):
    # x_effects=['x1'] should dummy-encode x1 (dropping one level) into
    # x1_30 / x1_9 columns alongside x2 and the intercept.
    result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=['x1'])

    assert_almost_equal(result._y.values.flat, [1, 4, 5])

    res = result._x
    exp_x = DataFrame([[0., 0., 14., 1.], [0, 1, 17, 1], [1, 0, 48, 1]],
                      columns=['x1_30', 'x1_9', 'x2', 'intercept'],
                      index=res.index, dtype=float)
    # Column order of the fitted design matrix is not guaranteed, so
    # align the expected frame to it before comparing.
    assert_frame_equal(res, exp_x.reindex(columns=res.columns))
#reindexing series and dataframes from pandas import Series, DataFrame obj = Series([100, 200, 300, 400, 500], index=['d', 'a', 'b', 'e', 'c']) print(obj) #reindexing Series obj = obj.reindex(['a', 'b', 'c', 'd', 'e']) print(obj) #---------------------------------------------------------- data = { 'Name': ['John', 'Kevin', 'Sam'], 'Age': [32, 42, 54], 'Salary': [300, 400, 500] } frame = DataFrame(data) print(frame) #reindexing row of DataFrame frame = frame.reindex([0, 2, 1]) print(frame) #reindexing column of DataFrame fields = ['Age', 'Name', 'Salary'] frame = frame.reindex(columns=fields) print(frame)
def test_sort_values_nan(self):
    # GH#3917: NaN placement during sort_values, controlled by
    # na_position, for single- and multi-key sorts on both axes.
    df = DataFrame({
        "A": [1, 2, np.nan, 1, 6, 8, 4],
        "B": [9, np.nan, 5, 2, 5, 4, 5]
    })

    # sort one column only
    expected = DataFrame(
        {
            "A": [np.nan, 1, 1, 2, 4, 6, 8],
            "B": [5, 9, 2, np.nan, 5, 5, 4]
        },
        index=[2, 0, 3, 1, 6, 4, 5],
    )
    sorted_df = df.sort_values(["A"], na_position="first")
    tm.assert_frame_equal(sorted_df, expected)

    # descending sort still puts NaN first when requested
    expected = DataFrame(
        {
            "A": [np.nan, 8, 6, 4, 2, 1, 1],
            "B": [5, 4, 5, 5, np.nan, 9, 2]
        },
        index=[2, 5, 4, 6, 1, 0, 3],
    )
    sorted_df = df.sort_values(["A"], na_position="first", ascending=False)
    tm.assert_frame_equal(sorted_df, expected)

    # axis=1 sort by row label 1, NaN column first
    expected = df.reindex(columns=["B", "A"])
    sorted_df = df.sort_values(by=1, axis=1, na_position="first")
    tm.assert_frame_equal(sorted_df, expected)

    # na_position='last', order
    expected = DataFrame(
        {
            "A": [1, 1, 2, 4, 6, 8, np.nan],
            "B": [2, 9, np.nan, 5, 5, 4, 5]
        },
        index=[3, 0, 1, 6, 4, 5, 2],
    )
    sorted_df = df.sort_values(["A", "B"])
    tm.assert_frame_equal(sorted_df, expected)

    # na_position='first', order
    expected = DataFrame(
        {
            "A": [np.nan, 1, 1, 2, 4, 6, 8],
            "B": [5, 2, 9, np.nan, 5, 5, 4]
        },
        index=[2, 3, 0, 1, 6, 4, 5],
    )
    sorted_df = df.sort_values(["A", "B"], na_position="first")
    tm.assert_frame_equal(sorted_df, expected)

    # na_position='first', not order
    expected = DataFrame(
        {
            "A": [np.nan, 1, 1, 2, 4, 6, 8],
            "B": [5, 9, 2, np.nan, 5, 5, 4]
        },
        index=[2, 0, 3, 1, 6, 4, 5],
    )
    sorted_df = df.sort_values(["A", "B"], ascending=[1, 0],
                               na_position="first")
    tm.assert_frame_equal(sorted_df, expected)

    # na_position='last', not order
    expected = DataFrame(
        {
            "A": [8, 6, 4, 2, 1, 1, np.nan],
            "B": [4, 5, 5, np.nan, 2, 9, 5]
        },
        index=[5, 4, 6, 1, 3, 0, 2],
    )
    sorted_df = df.sort_values(["A", "B"], ascending=[0, 1],
                               na_position="last")
    tm.assert_frame_equal(sorted_df, expected)
def test_reindex_single_named_indexer(self): # https://github.com/pandas-dev/pandas/issues/12392 df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}) result = df.reindex([0, 1], columns=["A"]) expected = DataFrame({"A": [1, 2]}) tm.assert_frame_equal(result, expected)
def test_sort_values(self): frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]], index=[1, 2, 3], columns=list("ABC")) # by column (axis=0) sorted_df = frame.sort_values(by="A") indexer = frame["A"].argsort().values expected = frame.loc[frame.index[indexer]] tm.assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by="A", ascending=False) indexer = indexer[::-1] expected = frame.loc[frame.index[indexer]] tm.assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by="A", ascending=False) tm.assert_frame_equal(sorted_df, expected) # GH4839 sorted_df = frame.sort_values(by=["A"], ascending=[False]) tm.assert_frame_equal(sorted_df, expected) # multiple bys sorted_df = frame.sort_values(by=["B", "C"]) expected = frame.loc[[2, 1, 3]] tm.assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=["B", "C"], ascending=False) tm.assert_frame_equal(sorted_df, expected[::-1]) sorted_df = frame.sort_values(by=["B", "A"], ascending=[True, False]) tm.assert_frame_equal(sorted_df, expected) msg = "No axis named 2 for object type DataFrame" with pytest.raises(ValueError, match=msg): frame.sort_values(by=["A", "B"], axis=2, inplace=True) # by row (axis=1): GH#10806 sorted_df = frame.sort_values(by=3, axis=1) expected = frame tm.assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=3, axis=1, ascending=False) expected = frame.reindex(columns=["C", "B", "A"]) tm.assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=[1, 2], axis="columns") expected = frame.reindex(columns=["B", "A", "C"]) tm.assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=[True, False]) tm.assert_frame_equal(sorted_df, expected) sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False) expected = frame.reindex(columns=["C", "B", "A"]) tm.assert_frame_equal(sorted_df, expected) msg = r"Length of ascending \(5\) != length of by \(2\)" with pytest.raises(ValueError, match=msg): 
frame.sort_values(by=["A", "B"], axis=0, ascending=[True] * 5)
def test_reindex_with_categoricalindex(self):
    # Reindexing a CategoricalIndex-ed frame: plain list targets return
    # a regular index, Categorical targets return a CategoricalIndex of
    # the same (possibly ordered) dtype, and unsupported arguments raise.
    df = DataFrame(
        {
            "A": np.arange(3, dtype="int64"),
        },
        index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"),
    )

    # reindexing
    # convert to a regular index
    result = df.reindex(["a", "b", "e"])
    expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
        "B"
    )
    tm.assert_frame_equal(result, expected, check_index_type=True)

    result = df.reindex(["a", "b"])
    expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
    tm.assert_frame_equal(result, expected, check_index_type=True)

    result = df.reindex(["e"])
    expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
    tm.assert_frame_equal(result, expected, check_index_type=True)

    result = df.reindex(["d"])
    expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B")
    tm.assert_frame_equal(result, expected, check_index_type=True)

    # since we are actually reindexing with a Categorical
    # then return a Categorical
    cats = list("cabe")

    result = df.reindex(Categorical(["a", "e"], categories=cats))
    expected = DataFrame(
        {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))}
    ).set_index("B")
    tm.assert_frame_equal(result, expected, check_index_type=True)

    result = df.reindex(Categorical(["a"], categories=cats))
    expected = DataFrame(
        {"A": [0], "B": Series(list("a")).astype(CDT(cats))}
    ).set_index("B")
    tm.assert_frame_equal(result, expected, check_index_type=True)

    result = df.reindex(["a", "b", "e"])
    expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
        "B"
    )
    tm.assert_frame_equal(result, expected, check_index_type=True)

    result = df.reindex(["a", "b"])
    expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
    tm.assert_frame_equal(result, expected, check_index_type=True)

    result = df.reindex(["e"])
    expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
    tm.assert_frame_equal(result, expected, check_index_type=True)

    # give back the type of categorical that we received
    result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True))
    expected = DataFrame(
        {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))}
    ).set_index("B")
    tm.assert_frame_equal(result, expected, check_index_type=True)

    result = df.reindex(Categorical(["a", "d"], categories=["a", "d"]))
    expected = DataFrame(
        {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))}
    ).set_index("B")
    tm.assert_frame_equal(result, expected, check_index_type=True)

    df2 = DataFrame(
        {
            "A": np.arange(6, dtype="int64"),
        },
        index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"),
    )
    # passed duplicate indexers are not allowed
    msg = "cannot reindex from a duplicate axis"
    with pytest.raises(ValueError, match=msg):
        df2.reindex(["a", "b"])

    # args NotImplemented ATM
    msg = r"argument {} is not implemented for CategoricalIndex\.reindex"
    with pytest.raises(NotImplementedError, match=msg.format("method")):
        df.reindex(["a"], method="ffill")
    with pytest.raises(NotImplementedError, match=msg.format("level")):
        df.reindex(["a"], level=1)
    with pytest.raises(NotImplementedError, match=msg.format("limit")):
        df.reindex(["a"], limit=2)
class Scores:
    """
    Parameters
    ----------
    uri : str, optional
    modality : str, optional

    Returns
    -------
    scores : `Scores`

    Examples
    --------

        >>> s = Scores(uri='video', modality='speaker')
        >>> s[Segment(0,1), 's1', 'A'] = 0.1
        >>> s[Segment(0,1), 's1', 'B'] = 0.2
        >>> s[Segment(0,1), 's1', 'C'] = 0.3
        >>> s[Segment(0,1), 's2', 'A'] = 0.4
        >>> s[Segment(0,1), 's2', 'B'] = 0.3
        >>> s[Segment(0,1), 's2', 'C'] = 0.2
        >>> s[Segment(2,3), 's1', 'A'] = 0.2
        >>> s[Segment(2,3), 's1', 'B'] = 0.1
        >>> s[Segment(2,3), 's1', 'C'] = 0.3

    """
    # Internal representation (established by __init__ / _reindexIfNeeded):
    #   annotation_ : Annotation holding the (segment, track) structure
    #   dataframe_  : DataFrame indexed by (segment fields..., track),
    #                 one column per label, cell = score value
    #   hasChanged_ : True when annotation_ and dataframe_ index may be
    #                 out of sync; _reindexIfNeeded() resolves it lazily.

    @classmethod
    def from_df(cls,
                df: DataFrame,
                uri: Optional[str] = None,
                modality: Optional[str] = None,
                aggfunc: Callable = np.mean):
        """Build a `Scores` instance from a long-format DataFrame.

        Parameters
        ----------
        df : DataFrame
            Must contain the following columns:
            'segment', 'track', 'label' and 'value'
        uri : str, optional
            Resource identifier
        modality : str, optional
            Modality
        aggfunc : func
            Value aggregation function in case of duplicate (segment,
            track, label) tuples

        Returns
        -------
        scores : `Scores`
        """
        # pivot long format -> wide format (one column per label)
        dataframe = pivot_table(df,
                                values=PYANNOTE_SCORE,
                                index=[PYANNOTE_SEGMENT, PYANNOTE_TRACK],
                                columns=PYANNOTE_LABEL,
                                aggfunc=aggfunc)

        # rebuild the (segment, track) structure from the pivoted index;
        # tracks get a placeholder '' label in the annotation
        annotation = Annotation(uri=uri, modality=modality)
        for index, _ in dataframe.iterrows():
            segment = Segment(*index[0])
            track = index[1]
            annotation[segment, track] = ''

        labels = dataframe.columns

        return cls(uri=uri, modality=modality, annotation=annotation,
                   labels=labels, values=dataframe.values)

    def __init__(self,
                 uri: Optional[str] = None,
                 modality: Optional[str] = None,
                 annotation: Optional[Annotation] = None,
                 labels: Iterable[Hashable] = None,
                 values: Optional[np.ndarray] = None,
                 dtype=None):
        # TODO maybe this should get removed
        # index level names: one per Segment field, plus the track name
        names = [
            PYANNOTE_SEGMENT + '_' + field.name for field in fields(Segment)
        ] + [PYANNOTE_TRACK]

        if annotation:
            annotation = annotation.copy()
            # flat Index of (segment fields..., track) tuples
            index = Index([s + (t, ) for s, t in annotation.itertracks()],
                          name=names)
        else:
            annotation = Annotation(uri=uri, modality=modality)
            # empty MultiIndex with the same level names
            # NOTE(review): MultiIndex(labels=...) was renamed to codes= in
            # pandas 0.24 — confirm the pandas version this targets
            index = MultiIndex(levels=[list() for name in names],
                               labels=[list() for name in names],
                               names=names)

        self.annotation_ = annotation
        columns = None if labels is None else list(labels)
        data = None if values is None else np.array(values)
        # NOTE(review): np.float is deprecated (removed in NumPy 1.24);
        # np.float64 / float is the modern equivalent
        dtype = np.float if values is None else values.dtype

        self.dataframe_ = DataFrame(data=data, dtype=dtype,
                                    index=index, columns=columns)

        self.hasChanged_ = True

        self.modality = modality
        self.uri = uri

    def copy(self) -> 'Scores':
        """Deep-ish copy (dataframe and annotation are copied)."""
        self._reindexIfNeeded()
        copied = self.__class__(uri=self.uri, modality=self.modality)
        copied.dataframe_ = self.dataframe_.copy()
        copied.annotation_ = self.annotation_.copy()
        copied.hasChanged_ = self.hasChanged_
        return copied

    # del scores[segment]
    # del scores[segment, :]
    # del scores[segment, track]
    def __delitem__(self, key: Key):
        # delete either a whole segment or one (segment, track) pair,
        # keeping dataframe_ and annotation_ in sync
        if isinstance(key, Segment):
            segment = key
            self.dataframe_.drop(tuple(segment), axis=0, inplace=True)
            del self.annotation_[segment]
            self.hasChanged_ = True

        elif isinstance(key, tuple) and len(key) == 2:
            segment, track = key
            self.dataframe_.drop(tuple(segment) + (track, ),
                                 axis=0, inplace=True)
            del self.annotation_[segment, track]
            self.hasChanged_ = True

        else:
            raise KeyError('')

    # value = scores[segment, track, label]
    def __getitem__(self, key):
        # 2-tuple form is shorthand for the default track '_'
        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key
        return self.dataframe_.at[tuple(segment) + (track, ), label]

    # scores[segment, track, label] = value
    # scores[segment, label] ==== scores[segment, '_', label]
    def __setitem__(self, key, value):
        if len(key) == 2:
            key = (key[0], '_', key[1])

        segment, track, label = key

        # do not add empty track
        if not segment:
            return

        self.dataframe_.at[tuple(segment) + (track, ), label] = value
        self.annotation_[segment, track] = label
        self.hasChanged_ = True

    def __len__(self):
        """Number of annotated segments"""
        return len(self.annotation_)

    def __nonzero__(self):
        # Python 2 compatibility shim
        return self.__bool__()

    def __bool__(self):
        """False if annotation is empty"""
        return True if self.annotation_ else False

    def __contains__(self, included):
        """Check if segments are annotated

        Parameters
        ----------
        included : `Segment` or `Timeline`

        Returns
        -------
        contains : bool
            True if every segment in `included` is annotated, False
            otherwise.
        """
        return included in self.annotation_

    def __iter__(self):
        """Iterate over sorted segments"""
        return iter(self.annotation_.get_timeline(copy=False))

    def __reversed__(self):
        """Reverse iterate over sorted segments"""
        return reversed(self.annotation_.get_timeline(copy=False))

    def itersegments(self):
        """Alias for iter(self)."""
        return iter(self)

    def tracks(self, segment: Segment):
        """Set of tracks for query segment

        Parameters
        ----------
        segment : `Segment`
            Query segment

        Returns
        -------
        tracks : set
            Set of tracks for query segment
        """
        return self.annotation_.get_tracks(segment)

    def has_track(self, segment: Segment, track):
        """Check whether a given track exists

        Parameters
        ----------
        segment : `Segment`
            Query segment
        track :
            Query track

        Returns
        -------
        exists : bool
            True if track exists for segment
        """
        return self.annotation_.has_track(segment, track)

    def get_track_by_name(self, track: TrackName) -> List[Tuple[Segment]]:
        """Get all tracks with given name

        Parameters
        ----------
        track : any valid track name
            Requested name track

        Returns
        -------
        tracks : list
            List of (segment, track) tuples
        """
        # WARNING: this doesn't call a valid class
        return self.annotation_.get_track_by_name(track)

    def new_track(self, segment: Segment,
                  candidate: Optional[TrackName] = None,
                  prefix: Optional[str] = None):
        """Track name generator

        Parameters
        ----------
        segment : Segment
        candidate : any valid track name
        prefix : str, optional

        Returns
        -------
        track : str
            New track name
        """
        # NOTE(review): candidate/prefix arguments are NOT forwarded —
        # literal None is passed instead; looks like a bug, confirm intent
        return self.annotation_.new_track(segment, candidate=None,
                                          prefix=None)

    def itertracks(self):
        """Iterate over annotation as (segment, track) tuple"""
        return self.annotation_.itertracks()

    def itervalues(self):
        """Iterate over scores as (segment, track, label, value) tuple"""

        # make sure segment/track pairs are sorted
        self._reindexIfNeeded()

        labels = self.labels()

        # yield one (segment, track, label) tuple per loop
        for index, columns in self.dataframe_.iterrows():
            segment = Segment(*index[:-1])
            track = index[-1]
            for label in labels:
                value = columns[label]
                # NaN cells mean "no score for this label" and are skipped
                if not np.isnan(value):
                    yield segment, track, label, value

    def get_track_scores(self, segment: Segment, track):
        """Get all scores for a given track.

        Parameters
        ----------
        segment : Segment
        track : hashable
            segment, track must be a valid track

        Returns
        -------
        scores : dict
            {label: score} dictionary
        """
        return dict(self.dataframe_.xs(tuple(segment) + (track, )))

    def labels(self) -> List[Label]:
        """List of labels

        Returns
        -------
        labels : list
            Sorted list of existing labels

        Remarks
        -------
            Labels are sorted based on their string representation.
        """
        return sorted(self.dataframe_.columns, key=str)

    def _reindexIfNeeded(self):
        # lazily realign dataframe_ rows with annotation_ tracks;
        # no-op unless a mutation flagged hasChanged_
        if not self.hasChanged_:
            return

        names = [
            PYANNOTE_SEGMENT + '_' + field.name for field in fields(Segment)
        ] + [PYANNOTE_TRACK]

        new_index = Index(
            [astuple(s) + (t, ) for s, t in self.annotation_.itertracks()],
            name=names)

        self.dataframe_ = self.dataframe_.reindex(new_index)

        self.hasChanged_ = False

        return

    def rename_tracks(self, generator: LabelGenerator = 'int'):
        """Rename tracks"""
        self._reindexIfNeeded()
        retracked = self.copy()

        annotation = self.annotation_.rename_tracks(generator=generator)
        retracked.annotation_ = annotation

        names = [
            PYANNOTE_SEGMENT + '_' + field.name for field in fields(Segment)
        ] + [PYANNOTE_TRACK]
        new_index = Index(
            [astuple(s) + (t, ) for s, t in annotation.itertracks()],
            name=names)
        # rows keep their order; only the index labels change
        retracked.dataframe_.index = new_index

        return retracked

    def apply(self, func: Callable, axis=0):
        """Apply `func` along `axis` of the underlying dataframe."""
        applied = self.copy()
        applied.dataframe_ = self.dataframe_.apply(func, axis=axis)
        applied.hasChanged_ = True
        return applied

    def rank(self, ascending: bool = False):
        """

        Parameters
        ----------
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        rank : `Scores`

        """
        ranked = self.copy()
        # shift pandas 1-based ranks to 0-based
        ranked.dataframe_ = -1 + self.dataframe_.rank(axis=1,
                                                      ascending=ascending)
        ranked.hasChanged_ = True
        return ranked

    def nbest(self, n: int, ascending: bool = False):
        """

        Parameters
        ----------
        n : int
            Size of n-best list
        ascending : boolean, default False
            False for ranks by high (0) to low (N-1)

        Returns
        -------
        nbest : `Scores`
            New scores where only n-best are kept.

        """
        filtered = self.copy()
        ranked_ = -1 + self.dataframe_.rank(axis=1, ascending=ascending)
        # keep cells whose 0-based rank is < n, blank out the rest
        # NOTE(review): np.NaN alias was removed in NumPy 2.0 — np.nan
        filtered.dataframe_ = filtered.dataframe_.where(ranked_ < n,
                                                        other=np.NaN)
        filtered.hasChanged_ = True
        return filtered

    def subset(self, labels: Set[Label], invert: bool = False):
        """Scores subset

        Extract scores subset based on labels

        Parameters
        ----------
        labels : set
            Set of labels
        invert : bool, optional
            If invert is True, extract all but requested `labels`

        Returns
        -------
        subset : `Scores`
            Scores subset.
        """
        self._reindexIfNeeded()

        if not isinstance(labels, set):
            raise TypeError('labels must be provided as a set of labels.')

        if invert:
            labels = set(self.labels()) - labels
        else:
            labels = labels & set(self.labels())

        subset = Scores(uri=self.uri, modality=self.modality)
        subset.annotation_ = self.annotation_
        subset.dataframe_ = self.dataframe_[list(labels)]

        return subset

    def to_annotation(self, threshold: float = -np.inf,
                      posterior: bool = False):
        """

        Parameters
        ----------
        threshold : float, optional
            Each track is annotated with the label with the highest score.
            Yet, if the latter is smaller than `threshold`, label is
            replaced with an `Unknown` instance.
        posterior : bool, optional
            If True, scores are posterior probabilities in open-set
            identification. If top model posterior is higher than unknown
            posterior, it is selected. Otherwise, label is replaced with
            an `Unknown` instance.
        """
        if not self:
            return Annotation(uri=self.uri, modality=self.modality)

        best = self.nbest(1, ascending=False)
        large_enough = best.copy()

        if posterior:
            # probability mass not assigned to any known label
            unknown_posterior = 1. - self.dataframe_.sum(axis=1)

            large_enough.dataframe_ = (
                ((best.dataframe_.T > unknown_posterior) &
                 (best.dataframe_.T > threshold)).T)

        else:

            large_enough.dataframe_ = (
                (best.dataframe_.T > threshold).T)

        # restore NaN where there was no best score to begin with
        large_enough.dataframe_.where(best.dataframe_.notnull(),
                                      inplace=True, other=np.NaN)

        annotation = Annotation(uri=self.uri, modality=self.modality)
        for segment, track, label, value in large_enough.itervalues():
            # boolean value: True -> keep label, False -> Unknown()
            label = label if value else Unknown()
            annotation[segment, track] = label

        return annotation

    def map(self, func: Callable):
        """Apply function to all values"""
        # NOTE(review): DataFrame.applymap is deprecated since pandas 2.1
        # in favour of DataFrame.map
        mapped = self.copy()
        mapped.dataframe_ = self.dataframe_.applymap(func)
        mapped.hasChanged_ = True
        return mapped

    def crop(self, focus: Support, mode: str = 'strict') -> Support:
        """Crop on focus

        Parameters
        ----------
        focus : `Segment` or `Timeline`

        mode : {'strict', 'loose', 'intersection'}
            In 'strict' mode, only segments fully included in focus
            coverage are kept. In 'loose' mode, any intersecting segment
            is kept unchanged. In 'intersection' mode, only intersecting
            segments are kept and replaced by their actual intersection
            with the focus.

        Returns
        -------
        cropped : same type as caller
            Cropped version of the caller containing only tracks matching
            the provided focus and mode.

        Remarks
        -------
        In 'intersection' mode, the best is done to keep the track names
        unchanged. However, in some cases where two original segments are
        cropped into the same resulting segments, conflicting track names
        are modified to make sure no track is lost.

        """
        # a single Segment is promoted to a one-segment Timeline
        if isinstance(focus, Segment):
            return self.crop(Timeline([focus], uri=self.uri), mode=mode)

        self._reindexIfNeeded()
        cropped = self.copy()

        if mode in ['strict', 'loose']:

            new_annotation = self.annotation_.crop(focus, mode=mode)
            # boolean row mask: keep rows whose track survived the crop
            keep = [
                new_annotation.has_track(segment, track)
                for segment, track in self.itertracks()
            ]
            cropped.dataframe_ = self.dataframe_[keep]
            cropped.annotation_ = new_annotation
            cropped.hasChanged_ = True

            return cropped

        elif mode in ['intersection']:

            raise NotImplementedError('')

            # # two original segments might be cropped into the same resulting
            # # segment -- therefore, we keep track of the mapping
            # intersection, mapping = timeline.crop(coverage,
            #                                       mode=mode, mapping=True)

            # # create new empty annotation
            # A = self.__class__(uri=self.uri, modality=self.modality)

            # for cropped in intersection:
            #     for original in mapping[cropped]:
            #         for track in self.tracks(original):
            #             # try to use original track name (candidate)
            #             # if it already exists, create a brand new one
            #             new_track = A.new_track(cropped, candidate=track)
            #             # copy each value, column by column
            #             for label in self.dataframe_.columns:
            #                 value = self.dataframe_.get_value((original, track),
            #                                                   label)
            #                 A.dataframe_ = A.dataframe_.set_value((cropped, new_track),
            #                                                       label, value)

            # return A

    def __str__(self):
        """Human-friendly representation"""
        if self:
            self._reindexIfNeeded()
            return str(self.dataframe_)
        else:
            return ""

    def _repr_png_(self):
        # IPython rich display hook
        from .notebook import repr_scores
        return repr_scores(self)
ser4 = Series(['USA', 'Mexico', 'Canada'], index=[0, 5, 10]) ranger = range(15) ser5 = ser4.reindex(ranger, method='ffill') print("\n Series 4") print(ser4) print(ser5) # reindex dataframe dframe = DataFrame(randn(25).reshape(5, 5), index=['A', 'B', 'D', 'E', 'F'], columns=['c1', 'c2', 'c3', 'c4', 'c5']) print(print("\n DFrame ")) print(dframe) print(print("\n DFrame2 ")) dframe2 = dframe.reindex(['A', 'B', 'C', 'D', 'E', 'F']) print(dframe2) # reindex columns new_columns = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6'] dframe3 = dframe2.reindex(columns=new_columns) print(print("\n DFrame3 ")) print(dframe3) # reindex in place print(print("\n DFrame ")) print(dframe) #dframe. #dframe.ix(['A', 'B', 'C', 'D', 'E', 'F'], new_columns]) #print (dframe)
def _apply_to_df(self, el: pd.DataFrame, *, axis, fill_value=None): if fill_value is None: fill_value = np.NaN return el.reindex(self.new_idx, axis=axis, fill_value=fill_value)
dup_labels.append(pd.Index(['add'])) dup_labels dup_labels.is_unique obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c']) obj obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e']) obj2 obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4]) obj3 obj3.reindex(range(6), method='ffill') #forward fill the values frame = DataFrame(np.arange(9).reshape(3,3), index = ['a', 'c', 'd'], columns = ['Ohio', 'Texas', 'California']) frame frame2 = frame.reindex(['a', 'b', 'c', 'd']) frame2 states = ['Texas', 'Utah', 'California'] frame.reindex(columns=states) frame.loc[['a', 'b', 'c', 'd'], states] frame.loc[:,states] obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e']) obj new_obj = obj.drop('c') new_obj obj.drop(['d', 'c']) data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'], columns=['one', 'two', 'three', 'four'])
##索引对象 frame2.index.name='index' frame2.columns.name='property' frame2.index[2]='t' # 不能更改索引,为了安全共享。 'gpa' in frame2.columns 'fou' in frame2.index #基本功能 ##重新索引 obj2 obj2.reindex(['a','c','d','t']) # 返回一个新的,obj2并没有改变 obj2.reindex(['a','b','c','d','t'],fill_value=0) # 填补缺失值# 填补缺失值 index must be monotonic increasing or decreasing frame2 frame2.reindex(columns=['name','gpa','year','university']) frame2.drop(['one','two'],axis=0) # 丢弃行,但axis是0,因为行的index是按列排列的!!!!!!!!!! arr3.shape np.mean(arr3,axis=1) #按行求均值 frame3=DataFrame(arr3) frame3.mean(axis=1) #按行求均值 frame2['one':'four'] frame2['gpa'] frame2[frame2['gpa'].astype(np.float)>3] # 转换数据类型 按条件删选 #算数运算 和数据对齐
def test_reindex_with_multi_index(self): # https://github.com/pandas-dev/pandas/issues/29896 # tests for reindexing a multi-indexed DataFrame with a new MultiIndex # # confirms that we can reindex a multi-indexed DataFrame with a new # MultiIndex object correctly when using no filling, backfilling, and # padding # # The DataFrame, `df`, used in this test is: # c # a b # -1 0 A # 1 B # 2 C # 3 D # 4 E # 5 F # 6 G # 0 0 A # 1 B # 2 C # 3 D # 4 E # 5 F # 6 G # 1 0 A # 1 B # 2 C # 3 D # 4 E # 5 F # 6 G # # and the other MultiIndex, `new_multi_index`, is: # 0: 0 0.5 # 1: 2.0 # 2: 5.0 # 3: 5.8 df = DataFrame( { "a": [-1] * 7 + [0] * 7 + [1] * 7, "b": list(range(7)) * 3, "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, } ).set_index(["a", "b"]) new_index = [0.5, 2.0, 5.0, 5.8] new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) # reindexing w/o a `method` value reindexed = df.reindex(new_multi_index) expected = DataFrame( {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]} ).set_index(["a", "b"]) tm.assert_frame_equal(expected, reindexed) # reindexing with backfilling expected = DataFrame( {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]} ).set_index(["a", "b"]) reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") tm.assert_frame_equal(expected, reindexed_with_backfilling) reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") tm.assert_frame_equal(expected, reindexed_with_backfilling) # reindexing with padding expected = DataFrame( {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]} ).set_index(["a", "b"]) reindexed_with_padding = df.reindex(new_multi_index, method="pad") tm.assert_frame_equal(expected, reindexed_with_padding) reindexed_with_padding = df.reindex(new_multi_index, method="ffill") tm.assert_frame_equal(expected, reindexed_with_padding)
obj3.reindex(range(6),method='ffill') # In[86]: frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],columns=['Beijing','Shanghai','Tongling']) # In[87]: frame # In[88]: frame2=frame.reindex(['a','b','c','d']) # In[89]: frame2 # In[90]: states=['Tongling','Shenzheng','Beijing'] # In[91]: frame.reindex(columns=states)
index=['Ohio', 'Texas', 'Colorado']) df2 = DataFrame(np.arange(12).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon']) print(df1) print(df2) print(df1 + df2) print() print("## Fill values:") df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd')) df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde')) print(df1) print(df2) print(df1.add(df2, fill_value=0)) print(df1.reindex(columns=df2.columns, fill_value=0)) print() print("## Operate between Series and DataFrame:") arr = np.arange(12.).reshape((3, 4)) print(arr) print(arr[0]) print(arr - arr[0]) frame = DataFrame(np.arange(12).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon']) series = frame.ix[0] print("frame:") print(frame) print("series:") print(series)
class TestJoin:
    """Tests for DataFrame.join / merge semantics.

    Relies on module-level helpers defined elsewhere in the file:
    get_test_data, _check_join, _join_by_hand, N, NGROUPS.
    NOTE: the last method (test_join_dups) is truncated at the end of
    this chunk.
    """

    def setup_method(self, method):
        # aggregate multiple columns
        self.df = DataFrame({
            "key1": get_test_data(),
            "key2": get_test_data(),
            "data1": np.random.randn(N),
            "data2": np.random.randn(N),
        })

        # exclude a couple keys for fun
        self.df = self.df[self.df["key2"] > 1]

        self.df2 = DataFrame({
            "key1": get_test_data(n=N // 5),
            "key2": get_test_data(ngroups=NGROUPS // 2, n=N // 5),
            "value": np.random.randn(N // 5),
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({
            "MergedA": data["A"],
            "MergedD": data["D"]
        }, index=data["C"])

    def test_left_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="left")

        joined_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, joined_both, ["key1", "key2"],
                    how="left")

    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="right")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="right")

        joined_both = merge(self.df, self.df2, how="right")
        _check_join(self.df, self.df2, joined_both, ["key1", "key2"],
                    how="right")

    def test_full_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="outer")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="outer")

        joined_both = merge(self.df, self.df2, how="outer")
        _check_join(self.df, self.df2, joined_both, ["key1", "key2"],
                    how="outer")

    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on="key2", how="inner")
        _check_join(self.df, self.df2, joined_key2, ["key2"], how="inner")

        joined_both = merge(self.df, self.df2, how="inner")
        _check_join(self.df, self.df2, joined_both, ["key1", "key2"],
                    how="inner")

    def test_handle_overlap(self):
        # overlapping non-key columns get the given suffixes
        joined = merge(self.df, self.df2, on="key2",
                       suffixes=(".foo", ".bar"))

        assert "key1.foo" in joined
        assert "key1.bar" in joined

    def test_handle_overlap_arbitrary_key(self):
        joined = merge(
            self.df,
            self.df2,
            left_on="key2",
            right_on="key1",
            suffixes=(".foo", ".bar"),
        )
        assert "key1.foo" in joined
        assert "key2.bar" in joined

    def test_join_on(self):
        target = self.target
        source = self.source

        merged = target.join(source, on="C")
        tm.assert_series_equal(merged["MergedA"], target["A"],
                               check_names=False)
        tm.assert_series_equal(merged["MergedD"], target["D"],
                               check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
        joined = df.join(df2, on="key")
        expected = DataFrame({
            "key": ["a", "a", "b", "b", "c"],
            "value": [0, 0, 1, 1, 2]
        })
        tm.assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"],
                         columns=["one"])
        df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
        joined = df_a.join(df_b, on="one")
        joined = joined.join(df_c, on="one")
        assert np.isnan(joined["two"]["c"])
        assert np.isnan(joined["three"]["c"])

        # merge column not present
        with pytest.raises(KeyError, match="^'E'$"):
            target.join(source, on="E")

        # overlap
        source_copy = source.copy()
        source_copy["A"] = 0
        msg = ("You are trying to merge on float64 and object columns. If "
               "you wish to proceed you should use pd.concat")
        with pytest.raises(ValueError, match=msg):
            target.join(source_copy, on="A")

    def test_join_on_fails_with_different_right_index(self):
        df = DataFrame({
            "a": np.random.choice(["m", "f"], size=3),
            "b": np.random.randn(3)
        })
        df2 = DataFrame(
            {
                "a": np.random.choice(["m", "f"], size=10),
                "b": np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, left_on="a", right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        df = DataFrame(
            {
                "a": np.random.choice(["m", "f"], size=3),
                "b": np.random.randn(3)
            },
            index=tm.makeCustomIndex(3, 2),
        )
        df2 = DataFrame({
            "a": np.random.choice(["m", "f"], size=10),
            "b": np.random.randn(10)
        })
        msg = r'len\(right_on\) must equal the number of levels in the index of "left"'
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="b", left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        df = DataFrame({
            "a": np.random.choice(["m", "f"], size=3),
            "b": np.random.randn(3)
        })
        df2 = DataFrame(
            {
                "a": np.random.choice(["m", "f"], size=10),
                "b": np.random.randn(10)
            },
            index=tm.makeCustomIndex(10, 2),
        )
        msg = r"len\(right_on\) must equal len\(left_on\)"
        with pytest.raises(ValueError, match=msg):
            merge(df, df2, right_on="a", left_on=["a", "b"])

    @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        # GH12081 - original issue
        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters

        df = DataFrame({"a": [1, 1]})

        msg = ("Can only merge Series or DataFrame objects, "
               f"a {type(wrong_type)} was passed")
        with pytest.raises(TypeError, match=msg):
            merge(wrong_type, df, left_on="a", right_on="a")
        with pytest.raises(TypeError, match=msg):
            merge(df, wrong_type, left_on="a", right_on="a")

    def test_join_on_pass_vector(self):
        expected = self.target.join(self.source, on="C")
        del expected["C"]

        # passing the join column as a Series instead of a column name
        join_col = self.target.pop("C")
        result = self.target.join(self.source, on=join_col)
        tm.assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on="C")
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]), on="C",
                                   how="inner")
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])

        joined = df.join(df2, on="key", how="inner")

        expected = df.join(df2, on="key")
        expected = expected[expected["value"].notna()]
        tm.assert_series_equal(joined["key"], expected["key"])
        tm.assert_series_equal(joined["value"], expected["value"],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
        df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])

        # corner cases
        joined = df.join(df2, on=["key"])
        expected = df.join(df2, on="key")

        tm.assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        result = self.target.join(self.source["MergedA"], on="C")
        expected = self.target.join(self.source[["MergedA"]], on="C")
        tm.assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({"a": [1, 1]})
        ds = Series([2], index=[1], name="b")
        result = df.join(ds, on="a")
        expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1["bool"] = True
        df1["string"] = "foo"

        df2 = DataFrame(index=np.arange(5, 15))
        df2["int"] = 1
        df2["float"] = 1.0

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        tm.assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        tm.assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        df1 = DataFrame(
            {
                "A": 1.0,
                "B": 2,
                "C": "foo",
                "D": True
            },
            index=np.arange(10),
            columns=["A", "B", "C", "D"],
        )
        assert df1["B"].dtype == np.int64
        assert df1["D"].dtype == np.bool_

        df2 = DataFrame(
            {
                "A": 1.0,
                "B": 2,
                "C": "foo",
                "D": True
            },
            index=np.arange(0, 10, 2),
            columns=["A", "B", "C", "D"],
        )

        # overlap
        joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
        expected_columns = [
            "A_one",
            "B_one",
            "C_one",
            "D_one",
            "A_two",
            "B_two",
            "C_two",
            "D_two",
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        tm.assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=["A"]), how="outer")

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(np.random.randn(30, 2), columns=["a", "b"])
        c = Series(np.random.randn(30))
        a["c"] = c
        d = DataFrame(np.random.randn(30, 1), columns=["q"])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays(
            [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        index2 = MultiIndex.from_arrays(
            [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
            names=["first", "second"],
        )

        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=["var X"])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=["var Y"])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how="outer")
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how="outer").sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        tm.assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        key1 = [
            "bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux",
            "snap"
        ]
        key2 = [
            "two",
            "one",
            "three",
            "one",
            "two",
            "one",
            "two",
            "two",
            "three",
            "one",
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        to_join = DataFrame(np.random.randn(10, 3), index=index,
                            columns=["j_one", "j_two", "j_three"])

        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data,
            to_join.reset_index(),
            left_on=["key1", "key2"],
            right_on=["first", "second"],
            how="inner",
            sort=False,
        )

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )
        tm.assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(
            to_join,
            data,
            right_on=["key1", "key2"],
            left_index=True,
            how="inner",
            sort=False,
        )

        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        tm.assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
        new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)],
                             columns=["a", "b", "d"])
        other_df.set_index("a", inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True,
                           right_index=True)
        assert ("b", "mean") in result
        assert "b" in result

    def test_join_float64_float32(self):
        # joining float64 with float32 keeps each column's dtype
        a = DataFrame(np.random.randn(10, 2), columns=["a", "b"],
                      dtype=np.float64)
        b = DataFrame(np.random.randn(10, 1), columns=["c"],
                      dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes["a"] == "float64"
        assert joined.dtypes["b"] == "float64"
        assert joined.dtypes["c"] == "float32"

        a = np.random.randint(0, 5, 100).astype("int64")
        b = np.random.random(100).astype("float64")
        c = np.random.random(100).astype("float32")
        df = DataFrame({"a": a, "b": b, "c": c})
        xpdf = DataFrame({"a": a, "b": b, "c": c})
        s = DataFrame(np.random.random(5).astype("float32"),
                      columns=["md"])
        rs = df.merge(s, left_on="a", right_index=True)
        assert rs.dtypes["a"] == "int64"
        assert rs.dtypes["b"] == "float64"
        assert rs.dtypes["c"] == "float32"
        assert rs.dtypes["md"] == "float32"

        xp = xpdf.merge(s, left_on="a", right_index=True)
        tm.assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how="outer")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
        expected = merge(df_partially_merged, df3, on=["a", "b"],
                         how="outer")

        result = result.reset_index()
        expected = expected[result.columns]
        expected["a"] = expected.a.astype("int64")
        expected["b"] = expected.b.astype("int64")
        tm.assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2],
                         "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how="inner")

        df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
        expected = merge(df_partially_merged, df3, on=["a", "b"],
                         how="inner")

        result = result.reset_index()

        tm.assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "three", "two", "two", "one",
                  "three"],
            "C": np.random.randn(8),
            "D": np.random.randn(8),
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2), name="TEST")
        inner = df.join(s, how="inner")
        outer = df.join(s, how="outer")
        left = df.join(s, how="left")
        right = df.join(s, how="right")
        tm.assert_frame_equal(inner, outer)
        tm.assert_frame_equal(inner, left)
        tm.assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({
            "key": ["foo", "bar", "baz", "foo"],
            "value": [1, 2, 3, 4]
        })
        right = DataFrame({"value2": ["a", "b", "c"]},
                          index=["bar", "baz", "foo"])

        joined = left.join(right, on="key", sort=True)
        expected = DataFrame(
            {
                "key": ["bar", "baz", "foo", "foo"],
                "value": [2, 3, 1, 4],
                "value2": ["a", "b", "c", "c"],
            },
            index=[1, 2, 0, 3],
        )
        tm.assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on="key", sort=False)
        tm.assert_index_equal(joined.index, Index(range(4)), exact=True)

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
        df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {
                "a": [1, 2, 3, 3, 4],
                "b": [5, np.nan, 6, 7, np.nan]
            },
            index=[1, 2, 3, 3, "a"],
        )
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
        df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({
            "a": [1, 2, 3, 4],
            "b": [5, 6, 6, np.nan]
        }, index=[1, 2, 2, "a"])
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range("2016-01-01", periods=16, freq="M")
        df = DataFrame(list(range(len(index))), index=index,
                       columns=["pnum"])
        df2 = concat([df, df])
        result = df.join(df2, how="inner", rsuffix="_df2")
        expected = DataFrame(
            np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1),
                    2),
            columns=["pnum", "pnum_df2"],
            index=df2.sort_index().index,
        )
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=["a", "b", "c", "d", "e", "f"])
        df.insert(0, "id", 0)
        df.insert(5, "dt", "foo")

        grouped = df.groupby("id")
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix="_right")

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list("abcdef"))
        df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [
            df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]
        ]

        def _check_diff_index(df_list, result, exp_index):
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how="outer")
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how="inner")
        _check_diff_index(df_list, joined, df.index[2:8])

        msg = "Joining multiple DataFrames only supported for joining on index"
        with pytest.raises(ValueError, match=msg):
            df_list[0].join(df_list[1:], on="a")

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4),
                       columns=["A", "B", "C", "D"])
        df["key"] = ["foo", "bar"] * 4
        df1 = df.loc[:, ["A", "B"]]
        df2 = df.loc[:, ["C", "D"]]
        df3 = df.loc[:, ["key"]]

        result = df1.join([df2, df3])
        tm.assert_frame_equal(result, df)

    def test_join_dups(self):
        # joining dups
        df = concat(
            [
                DataFrame(np.random.randn(10, 4),
                          columns=["A", "A", "B", "B"]),
                DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                          columns=["A", "C"]),
            ],
            axis=1,
        )

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix="_2")
        result.columns = expected.columns
        tm.assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True,
                      right_index=True).merge(z,
                                              left_index=True,
                                              right_index=True,
                                              how="outer")
        dta = dta.merge(w,
left_index=True, right_index=True) expected = concat([x, y, z, w], axis=1) expected.columns = [ "x_x", "y_x", "x_y", "y_y", "x_x", "y_x", "x_y", "y_y" ] tm.assert_frame_equal(dta, expected) def test_join_multi_to_multi(self, join_type): # GH 20475 leftindex = MultiIndex.from_product( [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"]) left = DataFrame({"v1": range(12)}, index=leftindex) rightindex = MultiIndex.from_product( [list("abc"), list("xy")], names=["abc", "xy"]) right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex) result = left.join(right, on=["abc", "xy"], how=join_type) expected = (left.reset_index().merge(right.reset_index(), on=["abc", "xy"], how=join_type).set_index( ["abc", "xy", "num"])) tm.assert_frame_equal(expected, result) msg = r'len\(left_on\) must equal the number of levels in the index of "right"' with pytest.raises(ValueError, match=msg): left.join(right, on="xy", how=join_type) with pytest.raises(ValueError, match=msg): right.join(left, on=["abc", "xy"], how=join_type) def test_join_on_tz_aware_datetimeindex(self): # GH 23931, 26335 df1 = DataFrame({ "date": pd.date_range(start="2018-01-01", periods=5, tz="America/Chicago"), "vals": list("abcde"), }) df2 = DataFrame({ "date": pd.date_range(start="2018-01-03", periods=5, tz="America/Chicago"), "vals_2": list("tuvwx"), }) result = df1.join(df2.set_index("date"), on="date") expected = df1.copy() expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object) tm.assert_frame_equal(result, expected) def test_join_datetime_string(self): # GH 5647 dfa = DataFrame( [ ["2012-08-02", "L", 10], ["2012-08-02", "J", 15], ["2013-04-06", "L", 20], ["2013-04-06", "J", 25], ], columns=["x", "y", "a"], ) dfa["x"] = pd.to_datetime(dfa["x"]) dfb = DataFrame( [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]], columns=["x", "y", "z"], index=[2, 4], ) dfb["x"] = pd.to_datetime(dfb["x"]) result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"]) expected = DataFrame( [ 
[Timestamp("2012-08-02 00:00:00"), "J", 1, 15], [Timestamp("2013-04-06 00:00:00"), "L", 2, 20], ], index=[2, 4], columns=["x", "y", "z", "a"], ) tm.assert_frame_equal(result, expected)
def test_reindex_axis_style_raises(self):
    # https://github.com/pandas-dev/pandas/issues/12392
    df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

    # Passing an explicit `axis` together with keyword/positional labels is
    # ambiguous; every one of these calls must raise the same TypeError.
    both_axis_msg = "Cannot specify both 'axis'"
    ambiguous_calls = [
        lambda: df.reindex([0, 1], ["A"], axis=1),
        lambda: df.reindex([0, 1], ["A"], axis="index"),
        lambda: df.reindex(index=[0, 1], axis="index"),
        lambda: df.reindex(index=[0, 1], axis="columns"),
        lambda: df.reindex(columns=[0, 1], axis="columns"),
        lambda: df.reindex(index=[0, 1], columns=[0, 1], axis="columns"),
    ]
    for bad_call in ambiguous_calls:
        with pytest.raises(TypeError, match=both_axis_msg):
            bad_call()

    # Three positional label arguments can never be interpreted.
    with pytest.raises(TypeError, match="Cannot specify all"):
        df.reindex([0, 1], [0], ["A"])

    # Mixing styles
    with pytest.raises(TypeError, match=both_axis_msg):
        df.reindex(index=[0, 1], axis="index")
    with pytest.raises(TypeError, match=both_axis_msg):
        df.reindex(index=[0, 1], axis="columns")

    # Duplicates: 'labels' collides with the positional labels argument.
    with pytest.raises(TypeError, match="multiple values"):
        df.reindex([0, 1], labels=[0, 1])
# Demo: Series/DataFrame reindexing.
# NOTE(review): `serie_1` is defined earlier in the file — presumably a Series
# whose index is a subset of a..h; confirm against the preceding lines.

# Reindex onto a larger label set; labels absent from serie_1 are filled with 0.
serie_2 = serie_1.reindex(["a", "b", "c", "d", "e", "f", "g", "h"], fill_value=0)
print(serie_2)

# A sparse integer-indexed Series used to demonstrate fill methods.
serie_3 = Series(["Santa Catarina", "Santo André", "Santo Antônio"], index=[0, 5, 8])
print(serie_3)

index_range = range(15)
# ffill: propagate the last known value forward over the new labels.
serie_4 = serie_3.reindex(index_range, method="ffill")
print(serie_4)
# bfill: fill from the next known value backward (labels beyond 8 stay NaN).
serie_4 = serie_3.reindex(index_range, method="bfill")
print(serie_4)
# nearest: each new label takes the value of the closest original label.
serie_4 = serie_3.reindex(index_range, method="nearest")
print(serie_4)

# 5x5 random frame; note there is deliberately no row "c".
data_frame = DataFrame(np.random.randn(25).reshape((5, 5)),
                       index=["a", "b", "d", "e", "f"],
                       columns=["col_1", "col_2", "col_3", "col_4", "col_5"])
print(data_frame)

# Column reindex: the brand-new "col_6" is filled with the constant 5.
data_frame_2 = data_frame.reindex(
    columns=["col_1", "col_2", "col_3", "col_4", "col_5", "col_6"],
    fill_value=5)
print(data_frame_2)
print()
def test_observed(observed):
    """Exercise ``groupby(..., observed=...)`` with categorical groupers.

    When ``observed=False`` the result index is expanded to the full
    cartesian product of the categories (via the module helper
    ``cartesian_product_for_groupers``); when ``observed=True`` only
    category combinations actually present in the data appear.

    Parameters
    ----------
    observed : bool
        Pytest fixture/parameter forwarded to ``groupby``.
    """
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869
    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df['C'] = ['foo', 'bar'] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(['A', 'B', 'C'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays([cat1, cat2, ['foo', 'bar'] * 2],
                                          names=['A', 'B', 'C'])
    expected = DataFrame({
        'values': Series([1, 2, 3, 4], index=exp_index)
    }).sort_index()
    result = gb.sum()
    if not observed:
        # unobserved: expand to all (cat1 x cat2 x C) combinations
        expected = cartesian_product_for_groupers(expected,
                                                  [cat1, cat2,
                                                   ['foo', 'bar']],
                                                  list('ABC'))
    tm.assert_frame_equal(result, expected)

    # two categorical groupers, no plain column
    gb = df.groupby(['A', 'B'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays([cat1, cat2], names=['A', 'B'])
    expected = DataFrame({'values': [1, 2, 3, 4]}, index=exp_index)
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(expected, [cat1, cat2],
                                                  list('AB'))
    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {
        'cat': pd.Categorical(["a", "b", "a", "b"],
                              categories=["a", "b", "c"], ordered=True),
        'ints': [1, 1, 2, 2],
        'val': [10, 20, 30, 40]
    }
    df = pd.DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()
    exp_index = pd.CategoricalIndex(list('ab'), name="cat",
                                    categories=list('abc'), ordered=True)
    expected = DataFrame({
        "ints": [1.5, 1.5],
        "val": [20., 30]
    }, index=exp_index)
    if not observed:
        # unobserved category "c" appears as an all-NaN row
        index = pd.CategoricalIndex(list('abc'), name="cat",
                                    categories=list('abc'), ordered=True)
        expected = expected.reindex(index)
    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg('mean')
    expected = DataFrame({
        "val": [10, 30, 20, 40],
        "cat": pd.Categorical(['a', 'a', 'b', 'b'],
                              categories=['a', 'b', 'c'], ordered=True),
        "ints": [1, 2, 1, 2]
    }).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(expected,
                                                  [df.cat.values, [1, 2]],
                                                  ['cat', 'ints'])
    tm.assert_frame_equal(result, expected)

    # GH 10132: get_group must work for every observed key combination
    for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        # consistency fix: use tm.assert_frame_equal like the rest of
        # this test (was the bare assert_frame_equal alias)
        tm.assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {
        'foo': [10, 8, 4, 8, 4, 1, 1],
        'bar': [10, 20, 30, 40, 50, 60, 70],
        'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']
    }
    df = pd.DataFrame(d)
    cat = pd.cut(df['foo'], np.linspace(0, 10, 3))
    df['range'] = cat
    groups = df.groupby(['range', 'baz'], as_index=False, observed=observed)
    result = groups.agg('mean')
    groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed)
    expected = groups2.agg('mean').reset_index()
    tm.assert_frame_equal(result, expected)
class TestJoin(object):
    """Tests for DataFrame/Panel join and merge behavior.

    Relies on module-level helpers defined elsewhere in this file:
    ``get_test_data``, ``_check_join``, ``_join_by_hand``, the constants
    ``N``/``NGROUPS``, the array alias ``a_`` and the low-level
    ``libjoin`` routines.
    """

    def setup_method(self, method):
        # aggregate multiple columns
        self.df = DataFrame({
            'key1': get_test_data(),
            'key2': get_test_data(),
            'data1': np.random.randn(N),
            'data2': np.random.randn(N)
        })

        # exclude a couple keys for fun
        self.df = self.df[self.df['key2'] > 1]

        self.df2 = DataFrame({
            'key1': get_test_data(n=N // 5),
            'key2': get_test_data(ngroups=NGROUPS // 2, n=N // 5),
            'value': np.random.randn(N // 5)
        })

        index, data = tm.getMixedTypeDict()
        self.target = DataFrame(data, index=index)

        # Join on string value
        self.source = DataFrame({
            'MergedA': data['A'],
            'MergedD': data['D']
        }, index=data['C'])

    def test_cython_left_outer_join(self):
        # exercise the low-level join routine directly
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.left_outer_join(left, right, max_group)

        # expected indexers computed by stable sort of each side
        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_(
            [0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
        exp_ri = a_(
            [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1  # -1 marks "no match" on the right

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_right_outer_join(self):
        # right outer join is left_outer_join with the sides swapped
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
        max_group = 5

        rs, ls = libjoin.left_outer_join(right, left, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        #            0  1  1  1
        exp_li = a_([
            0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5,
            #  2  2  4
            6, 7, 8, 6, 7, 8, -1
        ])
        exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_cython_inner_join(self):
        left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64)
        right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
        max_group = 5

        ls, rs = libjoin.inner_join(left, right, max_group)

        exp_ls = left.argsort(kind='mergesort')
        exp_rs = right.argsort(kind='mergesort')

        exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
        exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])

        exp_ls = exp_ls.take(exp_li)
        exp_ls[exp_li == -1] = -1

        exp_rs = exp_rs.take(exp_ri)
        exp_rs[exp_ri == -1] = -1

        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)

    def test_left_outer_join(self):
        # single-key and two-key left joins checked by the _check_join helper
        joined_key2 = merge(self.df, self.df2, on='key2')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='left')

        joined_both = merge(self.df, self.df2)
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='left')

    def test_right_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='right')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='right')

        joined_both = merge(self.df, self.df2, how='right')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='right')

    def test_full_outer_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='outer')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer')

        joined_both = merge(self.df, self.df2, how='outer')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='outer')

    def test_inner_join(self):
        joined_key2 = merge(self.df, self.df2, on='key2', how='inner')
        _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner')

        joined_both = merge(self.df, self.df2, how='inner')
        _check_join(self.df, self.df2, joined_both, ['key1', 'key2'],
                    how='inner')

    def test_handle_overlap(self):
        # overlapping non-key columns get the given suffixes
        joined = merge(self.df, self.df2, on='key2',
                       suffixes=['.foo', '.bar'])

        assert 'key1.foo' in joined
        assert 'key1.bar' in joined

    def test_handle_overlap_arbitrary_key(self):
        # suffixes also apply when joining on differently-named keys
        joined = merge(self.df, self.df2,
                       left_on='key2', right_on='key1',
                       suffixes=['.foo', '.bar'])
        assert 'key1.foo' in joined
        assert 'key2.bar' in joined

    def test_join_on(self):
        target = self.target
        source = self.source

        merged = target.join(source, on='C')
        tm.assert_series_equal(merged['MergedA'], target['A'],
                               check_names=False)
        tm.assert_series_equal(merged['MergedD'], target['D'],
                               check_names=False)

        # join with duplicates (fix regression from DataFrame/Matrix merge)
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])
        joined = df.join(df2, on='key')
        expected = DataFrame({
            'key': ['a', 'a', 'b', 'b', 'c'],
            'value': [0, 0, 1, 1, 2]
        })
        assert_frame_equal(joined, expected)

        # Test when some are missing
        df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'],
                         columns=['one'])
        df_b = DataFrame([['foo'], ['bar']], index=[1, 2], columns=['two'])
        df_c = DataFrame([[1], [2]], index=[1, 2], columns=['three'])
        joined = df_a.join(df_b, on='one')
        joined = joined.join(df_c, on='one')
        # row 'c' has no match (its 'one' value is 3), so it is NaN-filled
        assert np.isnan(joined['two']['c'])
        assert np.isnan(joined['three']['c'])

        # merge column not present
        pytest.raises(KeyError, target.join, source, on='E')

        # overlap
        source_copy = source.copy()
        source_copy['A'] = 0
        pytest.raises(ValueError, target.join, source_copy, on='A')

    def test_join_on_fails_with_different_right_index(self):
        with pytest.raises(ValueError):
            df = DataFrame({
                'a': np.random.choice(['m', 'f'], size=3),
                'b': np.random.randn(3)
            })
            df2 = DataFrame(
                {
                    'a': np.random.choice(['m', 'f'], size=10),
                    'b': np.random.randn(10)
                },
                index=tm.makeCustomIndex(10, 2))
            # single left_on key vs a 2-level right index -> ValueError
            merge(df, df2, left_on='a', right_index=True)

    def test_join_on_fails_with_different_left_index(self):
        with pytest.raises(ValueError):
            df = DataFrame(
                {
                    'a': np.random.choice(['m', 'f'], size=3),
                    'b': np.random.randn(3)
                },
                index=tm.makeCustomIndex(10, 2))
            df2 = DataFrame({
                'a': np.random.choice(['m', 'f'], size=10),
                'b': np.random.randn(10)
            })
            # single right_on key vs a 2-level left index -> ValueError
            merge(df, df2, right_on='b', left_index=True)

    def test_join_on_fails_with_different_column_counts(self):
        with pytest.raises(ValueError):
            df = DataFrame({
                'a': np.random.choice(['m', 'f'], size=3),
                'b': np.random.randn(3)
            })
            df2 = DataFrame(
                {
                    'a': np.random.choice(['m', 'f'], size=10),
                    'b': np.random.randn(10)
                },
                index=tm.makeCustomIndex(10, 2))
            # one right_on key vs two left_on keys -> ValueError
            merge(df, df2, right_on='a', left_on=['a', 'b'])

    @pytest.mark.parametrize("wrong_type", [2, 'str', None, np.array([0, 1])])
    def test_join_on_fails_with_wrong_object_type(self, wrong_type):
        # GH12081 - original issue
        # GH21220 - merging of Series and DataFrame is now allowed
        # Edited test to remove the Series object from test parameters
        df = DataFrame({'a': [1, 1]})

        with pytest.raises(TypeError, match=str(type(wrong_type))):
            merge(wrong_type, df, left_on='a', right_on='a')
        with pytest.raises(TypeError, match=str(type(wrong_type))):
            merge(df, wrong_type, left_on='a', right_on='a')

    def test_join_on_pass_vector(self):
        # on= may be an array-like of join keys rather than a column name
        expected = self.target.join(self.source, on='C')
        del expected['C']

        join_col = self.target.pop('C')
        result = self.target.join(self.source, on=join_col)
        assert_frame_equal(result, expected)

    def test_join_with_len0(self):
        # nothing to merge
        merged = self.target.join(self.source.reindex([]), on='C')
        for col in self.source:
            assert col in merged
            assert merged[col].isna().all()

        merged2 = self.target.join(self.source.reindex([]), on='C',
                                   how='inner')
        tm.assert_index_equal(merged2.columns, merged.columns)
        assert len(merged2) == 0

    def test_join_on_inner(self):
        df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1]}, index=['a', 'b'])

        joined = df.join(df2, on='key', how='inner')

        # inner join == left join with unmatched rows dropped
        expected = df.join(df2, on='key')
        expected = expected[expected['value'].notna()]
        tm.assert_series_equal(joined['key'], expected['key'],
                               check_dtype=False)
        tm.assert_series_equal(joined['value'], expected['value'],
                               check_dtype=False)
        tm.assert_index_equal(joined.index, expected.index)

    def test_join_on_singlekey_list(self):
        df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']})
        df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c'])

        # corner cases
        joined = df.join(df2, on=['key'])
        expected = df.join(df2, on='key')

        assert_frame_equal(joined, expected)

    def test_join_on_series(self):
        # joining a Series should equal joining the 1-column frame
        result = self.target.join(self.source['MergedA'], on='C')
        expected = self.target.join(self.source[['MergedA']], on='C')
        assert_frame_equal(result, expected)

    def test_join_on_series_buglet(self):
        # GH #638
        df = DataFrame({'a': [1, 1]})
        ds = Series([2], index=[1], name='b')
        result = df.join(ds, on='a')
        expected = DataFrame({'a': [1, 1], 'b': [2, 2]}, index=df.index)
        tm.assert_frame_equal(result, expected)

    def test_join_index_mixed(self, join_type):
        # no overlapping blocks
        df1 = DataFrame(index=np.arange(10))
        df1['bool'] = True
        df1['string'] = 'foo'

        df2 = DataFrame(index=np.arange(5, 15))
        df2['int'] = 1
        df2['float'] = 1.

        joined = df1.join(df2, how=join_type)
        expected = _join_by_hand(df1, df2, how=join_type)
        assert_frame_equal(joined, expected)

        joined = df2.join(df1, how=join_type)
        expected = _join_by_hand(df2, df1, how=join_type)
        assert_frame_equal(joined, expected)

    def test_join_index_mixed_overlap(self):
        df1 = DataFrame({
            'A': 1.,
            'B': 2,
            'C': 'foo',
            'D': True
        },
            index=np.arange(10),
            columns=['A', 'B', 'C', 'D'])
        assert df1['B'].dtype == np.int64
        assert df1['D'].dtype == np.bool_

        df2 = DataFrame({
            'A': 1.,
            'B': 2,
            'C': 'foo',
            'D': True
        },
            index=np.arange(0, 10, 2),
            columns=['A', 'B', 'C', 'D'])

        # overlap
        joined = df1.join(df2, lsuffix='_one', rsuffix='_two')
        expected_columns = [
            'A_one', 'B_one', 'C_one', 'D_one',
            'A_two', 'B_two', 'C_two', 'D_two'
        ]
        df1.columns = expected_columns[:4]
        df2.columns = expected_columns[4:]
        expected = _join_by_hand(df1, df2)
        assert_frame_equal(joined, expected)

    def test_join_empty_bug(self):
        # generated an exception in 0.4.3
        x = DataFrame()
        x.join(DataFrame([3], index=[0], columns=['A']), how='outer')

    def test_join_unconsolidated(self):
        # GH #331
        a = DataFrame(randn(30, 2), columns=['a', 'b'])
        c = Series(randn(30))
        a['c'] = c  # adding a column leaves 'a' unconsolidated
        d = DataFrame(randn(30, 1), columns=['q'])

        # it works!
        a.join(d)
        d.join(a)

    def test_join_multiindex(self):
        index1 = MultiIndex.from_arrays(
            [['a', 'a', 'a', 'b', 'b', 'b'], [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        index2 = MultiIndex.from_arrays(
            [['b', 'b', 'b', 'c', 'c', 'c'], [1, 2, 3, 1, 2, 3]],
            names=['first', 'second'])

        df1 = DataFrame(data=np.random.randn(6), index=index1,
                        columns=['var X'])
        df2 = DataFrame(data=np.random.randn(6), index=index2,
                        columns=['var Y'])

        df1 = df1.sort_index(level=0)
        df2 = df2.sort_index(level=0)

        joined = df1.join(df2, how='outer')
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names
        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

        # same result must hold when sorted by the second level
        df1 = df1.sort_index(level=1)
        df2 = df2.sort_index(level=1)

        joined = df1.join(df2, how='outer').sort_index(level=0)
        ex_index = Index(index1.values).union(Index(index2.values))
        expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
        expected.index.names = index1.names

        assert_frame_equal(joined, expected)
        assert joined.index.names == index1.names

    def test_join_inner_multiindex(self):
        key1 = [
            'bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
            'qux', 'snap'
        ]
        key2 = [
            'two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
            'three', 'one'
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2, 'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                  [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data, to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'],
                         how='inner', sort=False)

        expected2 = merge(to_join, data,
                          right_on=['key1', 'key2'], left_index=True,
                          how='inner', sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        expected2 = merge(to_join, data, right_on=['key1', 'key2'],
                          left_index=True, how='inner', sort=False)

        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        assert joined.index.is_monotonic
        assert_frame_equal(joined, expected)

        # _assert_same_contents(expected, expected2.loc[:, expected.columns])

    def test_join_hierarchical_mixed(self):
        # GH 2024
        df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
        new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]})
        other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
        other_df.set_index('a', inplace=True)
        # GH 9455, 12219
        with tm.assert_produces_warning(UserWarning):
            result = merge(new_df, other_df, left_index=True,
                           right_index=True)
        assert ('b', 'mean') in result
        assert 'b' in result

    def test_join_float64_float32(self):
        # dtypes must be preserved through a join
        a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64)
        b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32)
        joined = a.join(b)
        assert joined.dtypes['a'] == 'float64'
        assert joined.dtypes['b'] == 'float64'
        assert joined.dtypes['c'] == 'float32'

        a = np.random.randint(0, 5, 100).astype('int64')
        b = np.random.random(100).astype('float64')
        c = np.random.random(100).astype('float32')
        df = DataFrame({'a': a, 'b': b, 'c': c})
        xpdf = DataFrame({'a': a, 'b': b, 'c': c})
        s = DataFrame(np.random.random(5).astype('float32'), columns=['md'])
        rs = df.merge(s, left_on='a', right_index=True)
        assert rs.dtypes['a'] == 'int64'
        assert rs.dtypes['b'] == 'float64'
        assert rs.dtypes['c'] == 'float32'
        assert rs.dtypes['md'] == 'float32'

        xp = xpdf.merge(s, left_on='a', right_index=True)
        assert_frame_equal(rs, xp)

    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        # joining many frames must equal chained pairwise merges
        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'],
                         how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2],
                         "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'],
                         how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.loc[:, result.columns])

        # GH 11519
        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one',
                  'three'],
            'C': np.random.randn(8),
            'D': np.random.randn(8)
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2), name='TEST')
        # every join type gives the same frame for this aligned case
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)

    def test_join_sort(self):
        left = DataFrame({
            'key': ['foo', 'bar', 'baz', 'foo'],
            'value': [1, 2, 3, 4]
        })
        right = DataFrame({'value2': ['a', 'b', 'c']},
                          index=['bar', 'baz', 'foo'])

        joined = left.join(right, on='key', sort=True)
        expected = DataFrame(
            {
                'key': ['bar', 'baz', 'foo', 'foo'],
                'value': [2, 3, 1, 4],
                'value2': ['a', 'b', 'c', 'c']
            },
            index=[1, 2, 0, 3])
        assert_frame_equal(joined, expected)

        # smoke test
        joined = left.join(right, on='key', sort=False)
        tm.assert_index_equal(joined.index, pd.Index(lrange(4)))

    def test_join_mixed_non_unique_index(self):
        # GH 12814, unorderable types in py3 with a non-unique index
        df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a'])
        df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4])
        result = df1.join(df2)
        expected = DataFrame(
            {
                'a': [1, 2, 3, 3, 4],
                'b': [5, np.nan, 6, 7, np.nan]
            },
            index=[1, 2, 3, 3, 'a'])
        tm.assert_frame_equal(result, expected)

        df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a'])
        df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4])
        result = df3.join(df4)
        expected = DataFrame({
            'a': [1, 2, 3, 4],
            'b': [5, 6, 6, np.nan]
        }, index=[1, 2, 2, 'a'])
        tm.assert_frame_equal(result, expected)

    def test_join_non_unique_period_index(self):
        # GH #16871
        index = pd.period_range('2016-01-01', periods=16, freq='M')
        df = DataFrame([i for i in range(len(index))],
                       index=index, columns=['pnum'])
        df2 = concat([df, df])
        result = df.join(df2, how='inner', rsuffix='_df2')
        expected = DataFrame(np.tile(
            np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
            columns=['pnum', 'pnum_df2'],
            index=df2.sort_index().index)
        tm.assert_frame_equal(result, expected)

    def test_mixed_type_join_with_suffix(self):
        # GH #916
        df = DataFrame(np.random.randn(20, 6),
                       columns=['a', 'b', 'c', 'd', 'e', 'f'])
        df.insert(0, 'id', 0)
        df.insert(5, 'dt', 'foo')

        grouped = df.groupby('id')
        mn = grouped.mean()
        cn = grouped.count()

        # it works!
        mn.join(cn, rsuffix='_right')

    def test_join_many(self):
        df = DataFrame(np.random.randn(10, 6), columns=list('abcdef'))
        df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]]

        joined = df_list[0].join(df_list[1:])
        tm.assert_frame_equal(joined, df)

        df_list = [
            df[['a', 'b']][:-2], df[['c', 'd']][2:], df[['e', 'f']][1:9]
        ]

        def _check_diff_index(df_list, result, exp_index):
            # expected result is each frame reindexed to exp_index, joined
            reindexed = [x.reindex(exp_index) for x in df_list]
            expected = reindexed[0].join(reindexed[1:])
            tm.assert_frame_equal(result, expected)

        # different join types
        joined = df_list[0].join(df_list[1:], how='outer')
        _check_diff_index(df_list, joined, df.index)

        joined = df_list[0].join(df_list[1:])
        _check_diff_index(df_list, joined, df_list[0].index)

        joined = df_list[0].join(df_list[1:], how='inner')
        _check_diff_index(df_list, joined, df.index[2:8])

        # on= is not supported when joining a list of frames
        pytest.raises(ValueError, df_list[0].join, df_list[1:], on='a')

    def test_join_many_mixed(self):
        df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
        df['key'] = ['foo', 'bar'] * 4
        df1 = df.loc[:, ['A', 'B']]
        df2 = df.loc[:, ['C', 'D']]
        df3 = df.loc[:, ['key']]

        result = df1.join([df2, df3])
        assert_frame_equal(result, df)

    def test_join_dups(self):
        # joining dups
        df = concat([
            DataFrame(np.random.randn(10, 4),
                      columns=['A', 'A', 'B', 'B']),
            DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2),
                      columns=['A', 'C'])
        ], axis=1)

        expected = concat([df, df], axis=1)
        result = df.join(df, rsuffix='_2')
        result.columns = expected.columns
        assert_frame_equal(result, expected)

        # GH 4975, invalid join on dups
        w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
        z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])

        dta = x.merge(y, left_index=True,
                      right_index=True).merge(z, left_index=True,
                                              right_index=True, how="outer")
        dta = dta.merge(w, left_index=True, right_index=True)
        expected = concat([x, y, z, w], axis=1)
        expected.columns = [
            'x_x', 'y_x', 'x_y', 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'
        ]
        assert_frame_equal(dta, expected)

    def test_panel_join(self):
        # Panel is deprecated; silence the warnings for the whole test
        with catch_warnings(record=True):
            panel = tm.makePanel()
            tm.add_nans(panel)

            p1 = panel.iloc[:2, :10, :3]
            p2 = panel.iloc[2:, 5:, 2:]

            # left join
            result = p1.join(p2)
            expected = p1.copy()
            expected['ItemC'] = p2['ItemC']
            tm.assert_panel_equal(result, expected)

            # right join
            result = p1.join(p2, how='right')
            expected = p2.copy()
            expected['ItemA'] = p1['ItemA']
            expected['ItemB'] = p1['ItemB']
            expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC'])
            tm.assert_panel_equal(result, expected)

            # inner join
            result = p1.join(p2, how='inner')
            expected = panel.iloc[:, 5:10, 2:3]
            tm.assert_panel_equal(result, expected)

            # outer join
            result = p1.join(p2, how='outer')
            expected = p1.reindex(major=panel.major_axis,
                                  minor=panel.minor_axis)
            expected = expected.join(
                p2.reindex(major=panel.major_axis, minor=panel.minor_axis))
            tm.assert_panel_equal(result, expected)

    def test_panel_join_overlap(self):
        with catch_warnings(record=True):
            panel = tm.makePanel()
            tm.add_nans(panel)

            p1 = panel.loc[['ItemA', 'ItemB', 'ItemC']]
            p2 = panel.loc[['ItemB', 'ItemC']]

            # Expected index is
            #
            # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2
            joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2')
            p1_suf = p1.loc[['ItemB', 'ItemC']].add_suffix('_p1')
            p2_suf = p2.loc[['ItemB', 'ItemC']].add_suffix('_p2')
            no_overlap = panel.loc[['ItemA']]
            expected = no_overlap.join(p1_suf.join(p2_suf))
            tm.assert_panel_equal(joined, expected)

    def test_panel_join_many(self):
        with catch_warnings(record=True):
            tm.K = 10  # temporarily enlarge the generated panels
            panel = tm.makePanel()
            tm.K = 4

            panels = [panel.iloc[:2], panel.iloc[2:6], panel.iloc[6:]]

            joined = panels[0].join(panels[1:])
            tm.assert_panel_equal(joined, panel)

            panels = [
                panel.iloc[:2, :-5], panel.iloc[2:6, 2:],
                panel.iloc[6:, 5:-7]
            ]

            data_dict = {}
            for p in panels:
                data_dict.update(p.iteritems())

            joined = panels[0].join(panels[1:], how='inner')
            expected = pd.Panel.from_dict(data_dict, intersect=True)
            tm.assert_panel_equal(joined, expected)

            joined = panels[0].join(panels[1:], how='outer')
            expected = pd.Panel.from_dict(data_dict, intersect=False)
            tm.assert_panel_equal(joined, expected)

            # edge cases
            pytest.raises(ValueError, panels[0].join, panels[1:],
                          how='outer', lsuffix='foo', rsuffix='bar')
            pytest.raises(ValueError, panels[0].join, panels[1:],
                          how='right')

    def test_join_multi_to_multi(self, join_type):
        # GH 20475
        leftindex = MultiIndex.from_product(
            [list('abc'), list('xy'), [1, 2]], names=['abc', 'xy', 'num'])
        left = DataFrame({'v1': range(12)}, index=leftindex)

        rightindex = MultiIndex.from_product(
            [list('abc'), list('xy')], names=['abc', 'xy'])
        right = DataFrame({'v2': [100 * i for i in range(1, 7)]},
                          index=rightindex)

        result = left.join(right, on=['abc', 'xy'], how=join_type)
        expected = (left.reset_index().merge(right.reset_index(),
                                             on=['abc', 'xy'],
                                             how=join_type).set_index(
                                                 ['abc', 'xy', 'num']))
        assert_frame_equal(expected, result)

        # joining on a subset of the right index's levels must fail
        with pytest.raises(ValueError):
            left.join(right, on='xy', how=join_type)

        with pytest.raises(ValueError):
            right.join(left, on=['abc', 'xy'], how=join_type)
    def test_getitem_setitem_float_labels(self):
        """Label-based vs positional slicing on a float-labeled index."""
        index = Index([1.5, 2, 3, 4, 5])
        df = DataFrame(np.random.randn(5, 5), index=index)

        # .loc slices by label, inclusive of both endpoints
        result = df.loc[1.5:4]
        expected = df.reindex([1.5, 2, 3, 4])
        tm.assert_frame_equal(result, expected)
        assert len(result) == 4

        result = df.loc[4:5]
        expected = df.reindex([4, 5])  # reindex with int
        tm.assert_frame_equal(result, expected, check_index_type=False)
        assert len(result) == 2

        result = df.loc[4:5]
        expected = df.reindex([4.0, 5.0])  # reindex with float
        tm.assert_frame_equal(result, expected)
        assert len(result) == 2

        # loc_float changes this to work properly
        result = df.loc[1:2]
        expected = df.iloc[0:2]
        tm.assert_frame_equal(result, expected)

        df.loc[1:2] = 0
        result = df[1:2]
        assert (result == 0).all().all()

        # #2727
        index = Index([1.0, 2.5, 3.5, 4.5, 5.0])
        df = DataFrame(np.random.randn(5, 5), index=index)

        # positional slicing only via iloc!
        msg = ("cannot do positional indexing on Float64Index with "
               r"these indexers \[1.0\] of type float")
        with pytest.raises(TypeError, match=msg):
            df.iloc[1.0:5]

        result = df.iloc[4:5]
        expected = df.reindex([5.0])
        tm.assert_frame_equal(result, expected)
        assert len(result) == 1

        cp = df.copy()

        # float indexer on iloc raises for setting as well
        with pytest.raises(TypeError, match=_slice_msg):
            cp.iloc[1.0:5] = 0

        with pytest.raises(TypeError, match=msg):
            result = cp.iloc[1.0:5] == 0

        # `result` still holds the frame from the successful iloc[4:5] path
        assert result.values.all()
        assert (cp.iloc[0:1] == df.iloc[0:1]).values.all()

        cp = df.copy()
        cp.iloc[4:5] = 0
        assert (cp.iloc[4:5] == 0).values.all()
        assert (cp.iloc[0:4] == df.iloc[0:4]).values.all()

        # float slicing
        result = df.loc[1.0:5]
        expected = df
        tm.assert_frame_equal(result, expected)
        assert len(result) == 5

        result = df.loc[1.1:5]
        expected = df.reindex([2.5, 3.5, 4.5, 5.0])
        tm.assert_frame_equal(result, expected)
        assert len(result) == 4

        result = df.loc[4.51:5]
        expected = df.reindex([5.0])
        tm.assert_frame_equal(result, expected)
        assert len(result) == 1

        result = df.loc[1.0:5.0]
        expected = df.reindex([1.0, 2.5, 3.5, 4.5, 5.0])
        tm.assert_frame_equal(result, expected)
        assert len(result) == 5

        cp = df.copy()
        cp.loc[1.0:5.0] = 0
        result = cp.loc[1.0:5.0]
        assert (result == 0).values.all()
def test_basic():
    """Basic groupby behavior with Categorical keys (observed=False).

    Covers: unobserved categories appearing in the result index, single
    grouper sum, transform/apply/filter round-trips, pd.cut groupers
    (monotonic and non-monotonic bins), and describe() output shape.
    """
    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"], ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    # unobserved category 'd' shows up with NaN mean
    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        # first row per group after de-duplicating names
        return x.drop_duplicates('person_name').iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all),
        df['a'])
    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(c.values.categories,
                                 ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    # grouping by the raw ndarray must match, modulo the index type
    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels, categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    # reorder the data by category code and describe again for comparison
    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels, ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(exp_cats, sort=False,
                                observed=False).describe()
    assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels,
                                  ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)),
                          exp)
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] *
                4)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)),
                          exp)
# 当reindex时指定的index少于原有的index的情况 print(s1) print(s1.reindex(index=['A', 'B'])) # drop删除指定index值 print(s1) print(s1.drop('A')) print("{{{{{{{{{{{{{{{{{}}}}}}}}}}}") # dataframe的reindex # 改变dataframe的index和column: df1 = DataFrame(np.random.rand(25).reshape([5, 5]), index=['A', 'B', 'D', 'E', 'F'], columns=['c1', 'c2', 'c3', 'c4', 'c5']) print(df1) print(df1.reindex(index=['A', 'B', 'C', 'D', 'E', 'F'])) print(df1.reindex(columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6'])) # 同时改变dataframe的index和column: print( df1.reindex(index=['A', 'B', 'C', 'D', 'E', 'F'], columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6'])) # 当reindex时指定的index少于原有的index的情况: print(df1) print(df1.reindex(index=['A', 'B'])) # dataframe的drop操作 print(df1) print(df1.drop('A', axis=0)) print(df1.drop('c1', axis=1))
def test_observed(observed):
    """Groupby with multiple Categorical groupers: observed vs full product.

    With observed=False the result index is expanded to the cartesian
    product of the groupers' categories; with observed=True only the
    observed combinations appear.
    """
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df["C"] = ["foo", "bar"] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(["A", "B", "C"], observed=observed)
    exp_index = MultiIndex.from_arrays(
        [cat1, cat2, ["foo", "bar"] * 2], names=["A", "B", "C"]
    )
    expected = DataFrame(
        {"values": Series([1, 2, 3, 4], index=exp_index)}).sort_index()
    result = gb.sum()
    if not observed:
        # expand expectation to the full category product
        expected = cartesian_product_for_groupers(
            expected, [cat1, cat2, ["foo", "bar"]], list("ABC")
        )
    tm.assert_frame_equal(result, expected)

    gb = df.groupby(["A", "B"], observed=observed)
    exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
    expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [cat1, cat2], list("AB"))
    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {
        "cat": Categorical(
            ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
        ),
        "ints": [1, 1, 2, 2],
        "val": [10, 20, 30, 40],
    }
    df = DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = CategoricalIndex(
        list("ab"), name="cat", categories=list("abc"), ordered=True
    )
    expected = DataFrame({"ints": [1.5, 1.5], "val": [20.0, 30]},
                         index=exp_index)
    if not observed:
        # unobserved category 'c' is present with NaN aggregates
        index = CategoricalIndex(
            list("abc"), name="cat", categories=list("abc"), ordered=True
        )
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg("mean")
    expected = DataFrame(
        {
            "val": [10, 30, 20, 40],
            "cat": Categorical(
                ["a", "a", "b", "b"], categories=["a", "b", "c"],
                ordered=True
            ),
            "ints": [1, 2, 1, 2],
        }
    ).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [df.cat.values, [1, 2]], ["cat", "ints"]
        )

    tm.assert_frame_equal(result, expected)

    # GH 10132
    for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        tm.assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {
        "foo": [10, 8, 4, 8, 4, 1, 1],
        "bar": [10, 20, 30, 40, 50, 60, 70],
        "baz": ["d", "c", "e", "a", "a", "d", "c"],
    }
    df = DataFrame(d)
    cat = pd.cut(df["foo"], np.linspace(0, 10, 3))
    df["range"] = cat
    groups = df.groupby(["range", "baz"], as_index=False, observed=observed)
    result = groups.agg("mean")

    # as_index=False must be equivalent to as_index=True + reset_index
    groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed)
    expected = groups2.agg("mean").reset_index()
    tm.assert_frame_equal(result, expected)
def construct_datasets(Origin, min_year=2008, subsample=None):
    """Query flight records for one origin airport and split train/test.

    Pulls flights departing from `Origin` (Year >= min_year) out of the
    MySQL `DelayMeNot.flights` table, builds derived time features, and
    returns a shuffled 2/3-train / 1/3-test split.

    Python 2 code (print statements, integer division used deliberately
    in the hour/week columns below).

    Parameters:
        Origin    -- airport code used in the WHERE clause.
                     NOTE(review): interpolated directly into the SQL
                     string; fine for trusted internal values, but this
                     is injectable -- consider a parameterized query.
        min_year  -- earliest flight year to include (default 2008).
        subsample -- if not None, keep every `subsample`-th row.

    Returns:
        (data_train, data_test) -- two DataFrames.

    NOTE(review): database credentials are hard-coded in source; move
    them to configuration.  Relies on module-level `time`, `datetime`,
    `np`, `DataFrame` and `distance_to_holiday` being importable.
    """
    import MySQLdb
##    print '  Connecting to database...'
##    time0 = time.time()
    db = MySQLdb.connect(host="localhost", user="******",
                         passwd="z2yv52K*hJ<otclN", db="DelayMeNot",
                         local_infile=1)
##    print '  That took %.1f seconds' % (time.time() - time0)
    cur = db.cursor()

    ### flights origination from Origin
    print '  Querying database...'
    time0 = time.time()
##    if min_year == 2008:
##        cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, CRSArrTime, UniqueCarrier, CRSElapsedTime, ArrDelay, Dest, Distance FROM flights_2008_2013 WHERE Origin = '%s';" % (Origin))
##    else:
##        cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, CRSArrTime, UniqueCarrier, CRSElapsedTime, ArrDelay, Dest, Distance FROM flights WHERE Origin = '%s' AND Year >= %d;" % (Origin, min_year))
##    print "SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, CRSArrTime, UniqueCarrier, CRSElapsedTime, ArrDelay, Dest, Distance FROM flights WHERE Origin = '%s' AND Year >= %d;" % (Origin, min_year)
    cur.execute(
        "SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, CRSArrTime, UniqueCarrier, CRSElapsedTime, ArrDelay, Dest, Distance FROM flights WHERE Origin = '%s' AND Year >= %d;"
        % (Origin, min_year))
    print '  That took %.1f seconds.' % (time.time() - time0)
    rows = cur.fetchall()

    ### Convert to DataFrame
    print '  Converting to DataFrame...'
    time0 = time.time()
    df = DataFrame(list(rows), columns=[
        'Year', 'Month', 'DayOfMonth', 'DayOfWeek',
        'ScheduledDepartureTime', 'ScheduledArrivalTime', 'Carrier',
        'ScheduledElapsedTime', 'ArrivalDelay', 'Destination', 'Distance'
    ])
    del rows

    ### drop columns without delays (cancellations)
    df = df.dropna()

    ### Create some auxiliary columns
    # day-of-year derived from the calendar date of each flight
    df['DayOfYear'] = df.apply(lambda x: datetime.datetime(
        x['Year'], x['Month'], x['DayOfMonth']).timetuple().tm_yday,
                               axis=1)
    # integer division: week number 1..53
    df['Week'] = df['DayOfYear'] / 7 + 1
    # HHMM -> fractional hour (int division for HH, %100/60.0 for minutes)
    df['ScheduledDepartureHour'] = df['ScheduledDepartureTime'] / 100 + df[
        'ScheduledDepartureTime'] % 100 / 60.0
    df['ScheduledArrivalHour'] = df['ScheduledArrivalTime'] / 100 + df[
        'ScheduledArrivalTime'] % 100 / 60.0
    df = df.drop(['ScheduledDepartureTime', 'ScheduledArrivalTime'], axis=1)

    ### Add DaysFromNearestHoliday column
    df = distance_to_holiday(df)
##    df.head()
    print '  That took %.1f seconds.' % (time.time() - time0)

    ### subsample by a factor of 10
    if subsample is not None:
        print '  Subsampling (x%d) DataFrame...' % subsample
        time0 = time.time()
        # NOTE(review): .ix is deprecated in modern pandas; .iloc would do
        df = df.ix[::subsample]
        print '  That took %.1f seconds.' % (time.time() - time0)
##        print len(df)

##    ### Normalize columns (*** this isn't necessary for RandomForest ***)
##    def normalize(var):
##        return (var - var.min()).astype(float) / (var.max() - var.min())
##    df['Year'] = normalize(df['Year'])
##    df['Month'] = normalize(df['Month'])
##    df['DayOfMonth'] = normalize(df['DayOfMonth'])
##    df['DayOfWeek'] = normalize(df['DayOfWeek'])
##    df['DayOfYear'] = normalize(df['DayOfYear'])
##    df['Week'] = normalize(df['Week'])
##    df['ScheduledDepartureHour'] = normalize(df['ScheduledDepartureHour'])
##    df['ScheduledArrivalHour'] = normalize(df['ScheduledArrivalHour'])
##    df['ScheduledElapsedTime'] = normalize(df['ScheduledElapsedTime'])
##    df['Distance'] = normalize(df['Distance'])
##    df['DaysFromNearestHoliday'] = normalize(df['DaysFromNearestHoliday'])
##    print df.head()

##### Dummification should happen after unpickling, since including all the dummified columns makes the pickles huge!
##    ### "Dummify" the categorical 'Carrier' and 'Destination' columns,
##    ### and add the dummies to the table, but drop the first dummy
##    ### column to avoid "dummy variable trap".
##    dummies = pd.get_dummies(df['Carrier'],prefix='Carrier')
##    ##    print dummies.columns
##    df = df.join(dummies.ix[:,1:])
##    dummies = pd.get_dummies(df['Destination'],prefix='Destination')
##    df = df.join(dummies.ix[:,1:])
##    ### Drop dummified columns
##    df = df.drop(['Carrier','Destination'],axis=1)
##    print len(df.columns)
##    print df.head()

    ### Shuffle and create separate train and test datasets
    print '  Separating into training and testing dataset...'
    time0 = time.time()
    # random permutation of the index shuffles rows before the split
    df = df.reindex(np.random.permutation(df.index))
    Nrow = len(df)
    Ntrain = int(2.0 / 3.0 * Nrow)
    Ntest = Nrow - Ntrain
    data_train = df[:Ntrain]
    data_test = df[Ntrain:]
    del df
    print '  That took %.1f seconds.' % (time.time() - time0)

    ### Close up the cursor and database
    cur.close()
    db.close()

    return (data_train, data_test)
def test_getitem_boolean(self, mixed_float_frame, mixed_int_frame, datetime_frame): # boolean indexing d = datetime_frame.index[10] indexer = datetime_frame.index > d indexer_obj = indexer.astype(object) subindex = datetime_frame.index[indexer] subframe = datetime_frame[indexer] tm.assert_index_equal(subindex, subframe.index) with pytest.raises(ValueError, match="Item wrong length"): datetime_frame[indexer[:-1]] subframe_obj = datetime_frame[indexer_obj] tm.assert_frame_equal(subframe_obj, subframe) with pytest.raises(ValueError, match="Boolean array expected"): datetime_frame[datetime_frame] # test that Series work indexer_obj = Series(indexer_obj, datetime_frame.index) subframe_obj = datetime_frame[indexer_obj] tm.assert_frame_equal(subframe_obj, subframe) # test that Series indexers reindex # we are producing a warning that since the passed boolean # key is not the same as the given index, we will reindex # not sure this is really necessary with tm.assert_produces_warning(UserWarning): indexer_obj = indexer_obj.reindex(datetime_frame.index[::-1]) subframe_obj = datetime_frame[indexer_obj] tm.assert_frame_equal(subframe_obj, subframe) # test df[df > 0] for df in [ datetime_frame, mixed_float_frame, mixed_int_frame, ]: data = df._get_numeric_data() bif = df[df > 0] bifw = DataFrame( { c: np.where(data[c] > 0, data[c], np.nan) for c in data.columns }, index=data.index, columns=data.columns, ) # add back other columns to compare for c in df.columns: if c not in bifw: bifw[c] = df[c] bifw = bifw.reindex(columns=df.columns) tm.assert_frame_equal(bif, bifw, check_dtype=False) for c in df.columns: if bif[c].dtype != bifw[c].dtype: assert bif[c].dtype == df[c].dtype
# Peek at the first rows of the NFL frame built earlier in this script.
nfl_frame.head(3)
# NOTE(review): .ix is deprecated in modern pandas; .loc/.iloc preferred.
nfl_frame.ix[3]

# Add new columns to an existing data frame
# BUGFIX: np.arrange does not exist (AttributeError) -- np.arange is the
# correct NumPy function for an integer range.
nfl_frame['Stadium'] = np.arange(5)

# Assigning a Series aligns on index; rows 1-3 get NaN.
stadiums = Series(["Levi's Stadium", "AT&T Stadium"], index=[4, 0])
nfl_frame['Stadium'] = stadiums

# del removes a column in place
del nfl_frame['Stadium']

# Create data frames from dictionaries
data = {'City': ['SF', 'LA', 'NYC'], 'Population': [837000, 388000, 840000]}
city_frame = DataFrame(data)

# Reindex
from numpy.random import randn

ser1 = Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
my_index = ser1.index
# new labels are filled with NaN unless fill_value is given
ser2 = ser1.reindex(['A', 'B', 'C', 'D', 'E', 'F'])
ser2.reindex(['A', 'B', 'C', 'D', 'E', 'F', 'G'], fill_value=0)

# method='ffill' forward-fills values for the new labels
ser3 = Series(['USA', 'Mexico', 'Canada'], index=[0, 5, 10])
ser3.reindex(range(15), method='ffill')

dframe = DataFrame(randn(25).reshape((5, 5)),
                   index=['A', 'B', 'D', 'E', 'F'],
                   columns=['col1', 'col2', 'col3', 'col4', 'col5'])

new_columns = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6']
dframe2 = dframe.reindex(
    ['A', 'B', 'C', 'D', 'E',
     'F'])  #### This line and the line below are equivalent to the last line
dframe2.reindex(columns=new_columns)
#### dframe.ix[['A', 'B', 'C', 'D', 'E', 'F'], new_columns]
class TestCategoricalIndex:
    """Indexing behavior of DataFrames/Series with a CategoricalIndex.

    ``setup_method`` builds four frames indexed by categoricals:
    df  - string categories 'cab', all observed
    df2 - string categories 'cabe', 'e' unobserved
    df3 - int categories [3, 2, 1], ordered
    df4 - int categories [3, 2, 1], unordered
    """

    def setup_method(self, method):
        """Build the shared fixture frames (run before every test)."""
        self.df = DataFrame({
            "A": np.arange(6, dtype="int64"),
            "B": Series(list("aabbca")).astype(CDT(list("cab"))),
        }).set_index("B")
        self.df2 = DataFrame({
            "A": np.arange(6, dtype="int64"),
            "B": Series(list("aabbca")).astype(CDT(list("cabe"))),
        }).set_index("B")
        self.df3 = DataFrame({
            "A": np.arange(6, dtype="int64"),
            "B": (Series([1, 1, 2, 1, 3,
                          2]).astype(CDT([3, 2, 1], ordered=True))),
        }).set_index("B")
        self.df4 = DataFrame({
            "A": np.arange(6, dtype="int64"),
            "B": (Series([1, 1, 2, 1, 3,
                          2]).astype(CDT([3, 2, 1], ordered=False))),
        }).set_index("B")

    def test_loc_scalar(self):
        """Scalar .loc get/set on a CategoricalIndex, incl. error cases."""
        result = self.df.loc["a"]
        expected = DataFrame({
            "A": [0, 1, 5],
            "B": (Series(list("aaa")).astype(CDT(list("cab"))))
        }).set_index("B")
        tm.assert_frame_equal(result, expected)

        df = self.df.copy()
        df.loc["a"] = 20
        expected = DataFrame({
            "A": [20, 20, 2, 3, 4, 20],
            "B": (Series(list("aabbca")).astype(CDT(list("cab")))),
        }).set_index("B")
        tm.assert_frame_equal(df, expected)

        # value not in the categories
        with pytest.raises(KeyError, match=r"^'d'$"):
            df.loc["d"]

        msg = "cannot append a non-category item to a CategoricalIndex"
        with pytest.raises(TypeError, match=msg):
            df.loc["d"] = 10

        msg = ("cannot insert an item into a CategoricalIndex that is not"
               " already an existing category")
        with pytest.raises(TypeError, match=msg):
            df.loc["d", "A"] = 10
        with pytest.raises(TypeError, match=msg):
            df.loc["d", "C"] = 10

        msg = (
            r"cannot do label indexing on <class 'pandas\.core\.indexes\.category"
            r"\.CategoricalIndex'> with these indexers \[1\] of <class 'int'>")
        with pytest.raises(TypeError, match=msg):
            df.loc[1]

    def test_getitem_scalar(self):
        """Scalar __getitem__ with a Timestamp category label."""
        cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")])

        s = Series([1, 2], index=cats)

        expected = s.iloc[0]
        result = s[cats[0]]
        assert result == expected

    def test_slicing_directly(self):
        """Slicing a Categorical itself preserves the categories."""
        cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])
        sliced = cat[3]
        assert sliced == "d"
        sliced = cat[3:5]
        expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"])
        tm.assert_numpy_array_equal(sliced._codes, expected._codes)
        tm.assert_index_equal(sliced.categories, expected.categories)

    def test_slicing(self):
        """Row slicing/selection on frames holding categorical columns."""
        cat = Series(Categorical([1, 2, 3, 4]))
        reversed = cat[::-1]
        exp = np.array([4, 3, 2, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(reversed.__array__(), exp)

        df = DataFrame({"value": (np.arange(100) + 1).astype("int64")})
        df["D"] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])

        expected = Series([11, Interval(0, 25)], index=["value", "D"],
                          name=10)
        result = df.iloc[10]
        tm.assert_series_equal(result, expected)

        expected = DataFrame(
            {"value": np.arange(11, 21).astype("int64")},
            index=np.arange(10, 20).astype("int64"),
        )
        expected["D"] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
        result = df.iloc[10:20]
        tm.assert_frame_equal(result, expected)

        expected = Series([9, Interval(0, 25)], index=["value", "D"], name=8)
        result = df.loc[8]
        tm.assert_series_equal(result, expected)

    def test_slicing_and_getting_ops(self):
        # systematically test the slicing operations:
        # for all slicing ops:
        # - returning a dataframe
        # - returning a column
        # - returning a row
        # - returning a single value
        cats = Categorical(["a", "c", "b", "c", "c", "c", "c"],
                           categories=["a", "b", "c"])
        idx = Index(["h", "i", "j", "k", "l", "m", "n"])
        values = [1, 2, 3, 4, 5, 6, 7]
        df = DataFrame({"cats": cats, "values": values}, index=idx)

        # the expected values
        cats2 = Categorical(["b", "c"], categories=["a", "b", "c"])
        idx2 = Index(["j", "k"])
        values2 = [3, 4]

        # 2:4,: | "j":"k",:
        exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2)

        # :,"cats" | :,0
        exp_col = Series(cats, index=idx, name="cats")

        # "j",: | 2,:
        exp_row = Series(["b", 3], index=["cats", "values"], dtype="object",
                         name="j")

        # "j","cats | 2,0
        exp_val = "b"

        # iloc
        # frame
        res_df = df.iloc[2:4, :]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        # row
        res_row = df.iloc[2, :]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        # col
        res_col = df.iloc[:, 0]
        tm.assert_series_equal(res_col, exp_col)
        assert is_categorical_dtype(res_col)

        # single value
        res_val = df.iloc[2, 0]
        assert res_val == exp_val

        # loc
        # frame
        res_df = df.loc["j":"k", :]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        # row
        res_row = df.loc["j", :]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        # col
        res_col = df.loc[:, "cats"]
        tm.assert_series_equal(res_col, exp_col)
        assert is_categorical_dtype(res_col)

        # single value
        res_val = df.loc["j", "cats"]
        assert res_val == exp_val

        # ix
        # frame
        # res_df = df.loc["j":"k",[0,1]] # doesn't work?
        res_df = df.loc["j":"k", :]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        # row
        res_row = df.loc["j", :]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        # col
        res_col = df.loc[:, "cats"]
        tm.assert_series_equal(res_col, exp_col)
        assert is_categorical_dtype(res_col)

        # single value
        res_val = df.loc["j", df.columns[0]]
        assert res_val == exp_val

        # iat
        res_val = df.iat[2, 0]
        assert res_val == exp_val

        # at
        res_val = df.at["j", "cats"]
        assert res_val == exp_val

        # fancy indexing
        exp_fancy = df.iloc[[2]]

        res_fancy = df[df["cats"] == "b"]
        tm.assert_frame_equal(res_fancy, exp_fancy)
        res_fancy = df[df["values"] == 3]
        tm.assert_frame_equal(res_fancy, exp_fancy)

        # get_value
        res_val = df.at["j", "cats"]
        assert res_val == exp_val

        # i : int, slice, or sequence of integers
        res_row = df.iloc[2]
        tm.assert_series_equal(res_row, exp_row)
        assert isinstance(res_row["cats"], str)

        res_df = df.iloc[slice(2, 4)]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        res_df = df.iloc[[2, 3]]
        tm.assert_frame_equal(res_df, exp_df)
        assert is_categorical_dtype(res_df["cats"])

        res_col = df.iloc[:, 0]
        tm.assert_series_equal(res_col, exp_col)
        assert is_categorical_dtype(res_col)

        res_df = df.iloc[:, slice(0, 2)]
        tm.assert_frame_equal(res_df, df)
        assert is_categorical_dtype(res_df["cats"])

        res_df = df.iloc[:, [0, 1]]
        tm.assert_frame_equal(res_df, df)
        assert is_categorical_dtype(res_df["cats"])

    def test_slicing_doc_examples(self):
        """Examples from the categorical docs (GH 7918)."""
        # GH 7918
        cats = Categorical(["a", "b", "b", "b", "c", "c", "c"],
                           categories=["a", "b", "c"])
        idx = Index(["h", "i", "j", "k", "l", "m", "n"])
        values = [1, 2, 2, 2, 3, 4, 5]
        df = DataFrame({"cats": cats, "values": values}, index=idx)

        result = df.iloc[2:4, :]
        expected = DataFrame(
            {
                "cats": Categorical(["b", "b"], categories=["a", "b", "c"]),
                "values": [2, 2],
            },
            index=["j", "k"],
        )
        tm.assert_frame_equal(result, expected)

        result = df.iloc[2:4, :].dtypes
        expected = Series(["category", "int64"], ["cats", "values"])
        tm.assert_series_equal(result, expected)

        result = df.loc["h":"j", "cats"]
        expected = Series(
            Categorical(["a", "b", "b"], categories=["a", "b", "c"]),
            index=["h", "i", "j"],
            name="cats",
        )
        tm.assert_series_equal(result, expected)

        result = df.loc["h":"j", df.columns[0:1]]
        expected = DataFrame(
            {"cats": Categorical(["a", "b", "b"],
                                 categories=["a", "b", "c"])},
            index=["h", "i", "j"],
        )
        tm.assert_frame_equal(result, expected)

    def test_getitem_category_type(self):
        """iloc on a category-dtype Series preserves the dtype (GH 14580)."""
        # GH 14580
        # test iloc() on Series with Categorical data
        s = Series([1, 2, 3]).astype("category")

        # get slice
        result = s.iloc[0:2]
        expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3]))
        tm.assert_series_equal(result, expected)

        # get list of indexes
        result = s.iloc[[0, 1]]
        expected = Series([1, 2]).astype(CategoricalDtype([1, 2, 3]))
        tm.assert_series_equal(result, expected)

        # get boolean array
        result = s.iloc[[True, False, False]]
        expected = Series([1]).astype(CategoricalDtype([1, 2, 3]))
        tm.assert_series_equal(result, expected)

    def test_loc_listlike(self):
        """List-of-labels .loc on a CategoricalIndex."""
        # list of labels
        result = self.df.loc[["c", "a"]]
        expected = self.df.iloc[[4, 0, 1, 5]]
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = self.df2.loc[["a", "b", "e"]]
        exp_index = CategoricalIndex(list("aaabbe"),
                                     categories=list("cabe"), name="B")
        expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # element in the categories but not in the values
        with pytest.raises(KeyError, match=r"^'e'$"):
            self.df2.loc["e"]

        # assign is ok
        df = self.df2.copy()
        df.loc["e"] = 20
        result = df.loc[["a", "b", "e"]]
        exp_index = CategoricalIndex(list("aaabbe"),
                                     categories=list("cabe"), name="B")
        expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index)
        tm.assert_frame_equal(result, expected)

        df = self.df2.copy()
        result = df.loc[["a", "b", "e"]]
        exp_index = CategoricalIndex(list("aaabbe"),
                                     categories=list("cabe"), name="B")
        expected = DataFrame({"A": [0, 1, 5, 2, 3, np.nan]}, index=exp_index)
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # not all labels in the categories
        with pytest.raises(
                KeyError,
                match="'a list-indexer must only include values that are in the"
                " categories'",
        ):
            self.df2.loc[["a", "d"]]

    def test_loc_listlike_dtypes(self):
        """List-like .loc with unique/duplicated/unused categories (GH 11586)."""
        # GH 11586

        # unique categories and codes
        index = CategoricalIndex(["a", "b", "c"])
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)

        # unique slice
        res = df.loc[["a", "b"]]
        exp_index = CategoricalIndex(["a", "b"],
                                     categories=index.categories)
        exp = DataFrame({"A": [1, 2], "B": [4, 5]}, index=exp_index)
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[["a", "a", "b"]]
        exp_index = CategoricalIndex(["a", "a", "b"],
                                     categories=index.categories)
        exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index)
        tm.assert_frame_equal(res, exp, check_index_type=True)

        msg = "a list-indexer must only include values that are in the categories"
        with pytest.raises(KeyError, match=msg):
            df.loc[["a", "x"]]

        # duplicated categories and codes
        index = CategoricalIndex(["a", "b", "a"])
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)

        # unique slice
        res = df.loc[["a", "b"]]
        exp = DataFrame({
            "A": [1, 3, 2],
            "B": [4, 6, 5]
        }, index=CategoricalIndex(["a", "a", "b"]))
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[["a", "a", "b"]]
        exp = DataFrame(
            {
                "A": [1, 3, 1, 3, 2],
                "B": [4, 6, 4, 6, 5]
            },
            index=CategoricalIndex(["a", "a", "a", "a", "b"]),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        msg = "a list-indexer must only include values that are in the categories"
        with pytest.raises(KeyError, match=msg):
            df.loc[["a", "x"]]

        # contains unused category
        index = CategoricalIndex(["a", "b", "a", "c"],
                                 categories=list("abcde"))
        df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)

        res = df.loc[["a", "b"]]
        exp = DataFrame(
            {
                "A": [1, 3, 2],
                "B": [5, 7, 6]
            },
            index=CategoricalIndex(["a", "a", "b"],
                                   categories=list("abcde")),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        res = df.loc[["a", "e"]]
        exp = DataFrame(
            {
                "A": [1, 3, np.nan],
                "B": [5, 7, np.nan]
            },
            index=CategoricalIndex(["a", "a", "e"],
                                   categories=list("abcde")),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        # duplicated slice
        res = df.loc[["a", "a", "b"]]
        exp = DataFrame(
            {
                "A": [1, 3, 1, 3, 2],
                "B": [5, 7, 5, 7, 6]
            },
            index=CategoricalIndex(["a", "a", "a", "a", "b"],
                                   categories=list("abcde")),
        )
        tm.assert_frame_equal(res, exp, check_index_type=True)

        msg = "a list-indexer must only include values that are in the categories"
        with pytest.raises(KeyError, match=msg):
            df.loc[["a", "x"]]

    def test_get_indexer_array(self):
        """get_indexer with an object ndarray of Timestamp categories."""
        arr = np.array(
            [
                Timestamp("1999-12-31 00:00:00"),
                Timestamp("2000-12-31 00:00:00")
            ],
            dtype=object,
        )
        cats = [
            Timestamp("1999-12-31 00:00:00"),
            Timestamp("2000-12-31 00:00:00")
        ]
        ci = CategoricalIndex(cats, categories=cats, ordered=False,
                              dtype="category")
        result = ci.get_indexer(arr)
        expected = np.array([0, 1], dtype="intp")
        tm.assert_numpy_array_equal(result, expected)

    def test_get_indexer_same_categories_same_order(self):
        """get_indexer when target shares categories in the same order."""
        ci = CategoricalIndex(["a", "b"], categories=["a", "b"])

        result = ci.get_indexer(
            CategoricalIndex(["b", "b"], categories=["a", "b"]))
        expected = np.array([1, 1], dtype="intp")
        tm.assert_numpy_array_equal(result, expected)

    def test_get_indexer_same_categories_different_order(self):
        """get_indexer matches by value even when category order differs."""
        # https://github.com/pandas-dev/pandas/issues/19551
        ci = CategoricalIndex(["a", "b"], categories=["a", "b"])

        result = ci.get_indexer(
            CategoricalIndex(["b", "b"], categories=["b", "a"]))
        expected = np.array([1, 1], dtype="intp")
        tm.assert_numpy_array_equal(result, expected)

    def test_getitem_with_listlike(self):
        """__getitem__ with a list of categorical column labels (GH 16115)."""
        # GH 16115
        cats = Categorical([Timestamp("12-31-1999"), Timestamp("12-31-2000")])

        expected = DataFrame([[1, 0], [0, 1]], dtype="uint8", index=[0, 1],
                             columns=cats)
        dummies = pd.get_dummies(cats)
        result = dummies[list(dummies.columns)]
        tm.assert_frame_equal(result, expected)

    def test_setitem_listlike(self):
        """Array indexers are coerced correctly on setitem (GH 9469)."""
        # GH 9469
        # properly coerce the input indexers
        np.random.seed(1)
        c = Categorical(np.random.randint(0, 5, size=150000).astype(
            np.int8)).add_categories([-1000])
        indexer = np.array([100000]).astype(np.int64)
        c[indexer] = -1000

        # we are asserting the code result here
        # which maps to the -1000 category
        result = c.codes[np.array([100000]).astype(np.int64)]
        tm.assert_numpy_array_equal(result, np.array([5], dtype="int8"))

    def test_ix_categorical_index(self):
        """.loc on frames whose index/columns are CategoricalIndex (GH 12531)."""
        # GH 12531
        df = DataFrame(np.random.randn(3, 3), index=list("ABC"),
                       columns=list("XYZ"))
        cdf = df.copy()
        cdf.index = CategoricalIndex(df.index)
        cdf.columns = CategoricalIndex(df.columns)

        expect = Series(df.loc["A", :], index=cdf.columns, name="A")
        tm.assert_series_equal(cdf.loc["A", :], expect)

        expect = Series(df.loc[:, "X"], index=cdf.index, name="X")
        tm.assert_series_equal(cdf.loc[:, "X"], expect)

        exp_index = CategoricalIndex(list("AB"), categories=["A", "B", "C"])
        expect = DataFrame(df.loc[["A", "B"], :], columns=cdf.columns,
                           index=exp_index)
        tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)

        exp_columns = CategoricalIndex(list("XY"),
                                       categories=["X", "Y", "Z"])
        expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index,
                           columns=exp_columns)
        tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)

        # non-unique
        df = DataFrame(np.random.randn(3, 3), index=list("ABA"),
                       columns=list("XYX"))
        cdf = df.copy()
        cdf.index = CategoricalIndex(df.index)
        cdf.columns = CategoricalIndex(df.columns)

        exp_index = CategoricalIndex(list("AA"), categories=["A", "B"])
        expect = DataFrame(df.loc["A", :], columns=cdf.columns,
                           index=exp_index)
        tm.assert_frame_equal(cdf.loc["A", :], expect)

        exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"])
        expect = DataFrame(df.loc[:, "X"], index=cdf.index,
                           columns=exp_columns)
        tm.assert_frame_equal(cdf.loc[:, "X"], expect)

        expect = DataFrame(
            df.loc[["A", "B"], :],
            columns=cdf.columns,
            index=CategoricalIndex(list("AAB")),
        )
        tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)

        expect = DataFrame(
            df.loc[:, ["X", "Y"]],
            index=cdf.index,
            columns=CategoricalIndex(list("XXY")),
        )
        tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)

    def test_read_only_source(self):
        """Indexing works the same for frames over read-only arrays (GH 10043)."""
        # GH 10043
        rw_array = np.eye(10)
        rw_df = DataFrame(rw_array)

        ro_array = np.eye(10)
        ro_array.setflags(write=False)
        ro_df = DataFrame(ro_array)

        tm.assert_frame_equal(rw_df.iloc[[1, 2, 3]], ro_df.iloc[[1, 2, 3]])
        tm.assert_frame_equal(rw_df.iloc[[1]], ro_df.iloc[[1]])
        tm.assert_series_equal(rw_df.iloc[1], ro_df.iloc[1])
        tm.assert_frame_equal(rw_df.iloc[1:3], ro_df.iloc[1:3])

        tm.assert_frame_equal(rw_df.loc[[1, 2, 3]], ro_df.loc[[1, 2, 3]])
        tm.assert_frame_equal(rw_df.loc[[1]], ro_df.loc[[1]])
        tm.assert_series_equal(rw_df.loc[1], ro_df.loc[1])
        tm.assert_frame_equal(rw_df.loc[1:3], ro_df.loc[1:3])

    def test_reindexing(self):
        """reindex with plain lists vs Categorical indexers."""
        df = DataFrame({
            "A": np.arange(3, dtype="int64"),
            "B": Series(list("abc")).astype(CDT(list("cabe"))),
        }).set_index("B")

        # reindexing
        # convert to a regular index
        result = df.reindex(["a", "b", "e"])
        expected = DataFrame({
            "A": [0, 1, np.nan],
            "B": Series(list("abe"))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["a", "b"])
        expected = DataFrame({
            "A": [0, 1],
            "B": Series(list("ab"))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["e"])
        expected = DataFrame({
            "A": [np.nan],
            "B": Series(["e"])
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["d"])
        expected = DataFrame({
            "A": [np.nan],
            "B": Series(["d"])
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # since we are actually reindexing with a Categorical
        # then return a Categorical
        cats = list("cabe")

        result = df.reindex(Categorical(["a", "e"], categories=cats))
        expected = DataFrame({
            "A": [0, np.nan],
            "B": Series(list("ae")).astype(CDT(cats))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(Categorical(["a"], categories=cats))
        expected = DataFrame({
            "A": [0],
            "B": Series(list("a")).astype(CDT(cats))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["a", "b", "e"])
        expected = DataFrame({
            "A": [0, 1, np.nan],
            "B": Series(list("abe"))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["a", "b"])
        expected = DataFrame({
            "A": [0, 1],
            "B": Series(list("ab"))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(["e"])
        expected = DataFrame({
            "A": [np.nan],
            "B": Series(["e"])
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # give back the type of categorical that we received
        result = df.reindex(
            Categorical(["a", "e"], categories=cats, ordered=True))
        expected = DataFrame({
            "A": [0, np.nan],
            "B": Series(list("ae")).astype(CDT(cats, ordered=True))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        result = df.reindex(Categorical(["a", "d"], categories=["a", "d"]))
        expected = DataFrame({
            "A": [0, np.nan],
            "B": Series(list("ad")).astype(CDT(["a", "d"]))
        }).set_index("B")
        tm.assert_frame_equal(result, expected, check_index_type=True)

        # passed duplicate indexers are not allowed
        msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=msg):
            self.df2.reindex(["a", "b"])

        # args NotImplemented ATM
        msg = r"argument {} is not implemented for CategoricalIndex\.reindex"
        with pytest.raises(NotImplementedError, match=msg.format("method")):
            df.reindex(["a"], method="ffill")
        with pytest.raises(NotImplementedError, match=msg.format("level")):
            df.reindex(["a"], level=1)
        with pytest.raises(NotImplementedError, match=msg.format("limit")):
            df.reindex(["a"], limit=2)

    def test_loc_slice(self):
        """Integer slicing on a CategoricalIndex raises (GH 9748)."""
        # slicing
        # not implemented ATM
        # GH9748

        msg = ("cannot do slice indexing on {klass} with these "
               r"indexers \[1\] of {kind}".format(klass=str(CategoricalIndex),
                                                  kind=str(int)))
        with pytest.raises(TypeError, match=msg):
            self.df.loc[1:5]

        # result = df.loc[1:5]
        # expected = df.iloc[[1,2,3,4]]
        # tm.assert_frame_equal(result, expected)

    def test_loc_and_at_with_categorical_index(self):
        """Scalar .loc and .at lookups on a CategoricalIndex (GH 20629)."""
        # GH 20629
        s = Series([1, 2, 3], index=pd.CategoricalIndex(["A", "B", "C"]))
        assert s.loc["A"] == 1
        assert s.at["A"] == 1
        df = DataFrame([[1, 2], [3, 4], [5, 6]],
                       index=pd.CategoricalIndex(["A", "B", "C"]))
        assert df.loc["B", 1] == 4
        assert df.at["B", 1] == 4

    def test_boolean_selection(self):
        """Boolean masks built from comparisons against the index labels."""
        df3 = self.df3
        df4 = self.df4

        result = df3[df3.index == "a"]
        expected = df3.iloc[[]]
        tm.assert_frame_equal(result, expected)

        result = df4[df4.index == "a"]
        expected = df4.iloc[[]]
        tm.assert_frame_equal(result, expected)

        result = df3[df3.index == 1]
        expected = df3.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        result = df4[df4.index == 1]
        expected = df4.iloc[[0, 1, 3]]
        tm.assert_frame_equal(result, expected)

        # since we have an ordered categorical

        # CategoricalIndex([1, 1, 2, 1, 3, 2],
        #                  categories=[3, 2, 1],
        #                  ordered=True,
        #                  name='B')
        result = df3[df3.index < 2]
        expected = df3.iloc[[4]]
tm.assert_frame_equal(result, expected) result = df3[df3.index > 1] expected = df3.iloc[[]] tm.assert_frame_equal(result, expected) # unordered # cannot be compared # CategoricalIndex([1, 1, 2, 1, 3, 2], # categories=[3, 2, 1], # ordered=False, # name='B') msg = "Unordered Categoricals can only compare equality or not" with pytest.raises(TypeError, match=msg): df4[df4.index < 2] with pytest.raises(TypeError, match=msg): df4[df4.index > 1] def test_indexing_with_category(self): # https://github.com/pandas-dev/pandas/issues/12564 # consistent result if comparing as Dataframe cat = DataFrame({"A": ["foo", "bar", "baz"]}) exp = DataFrame({"A": [True, False, False]}) res = cat[["A"]] == "foo" tm.assert_frame_equal(res, exp) cat["A"] = cat["A"].astype("category") res = cat[["A"]] == "foo" tm.assert_frame_equal(res, exp) def test_map_with_dict_or_series(self): orig_values = ["a", "B", 1, "a"] new_values = ["one", 2, 3.0, "one"] cur_index = pd.CategoricalIndex(orig_values, name="XXX") expected = pd.CategoricalIndex(new_values, name="XXX", categories=[3.0, 2, "one"]) mapper = pd.Series(new_values[:-1], index=orig_values[:-1]) output = cur_index.map(mapper) # Order of categories in output can be different tm.assert_index_equal(expected, output) mapper = {o: n for o, n in zip(orig_values[:-1], new_values[:-1])} output = cur_index.map(mapper) # Order of categories in output can be different tm.assert_index_equal(expected, output) @pytest.mark.parametrize( "idx_values", [ # python types [1, 2, 3], [-1, -2, -3], [1.5, 2.5, 3.5], [-1.5, -2.5, -3.5], # numpy int/uint *[ np.array([1, 2, 3], dtype=dtype) for dtype in conftest.ALL_INT_DTYPES ], # numpy floats *[ np.array([1.5, 2.5, 3.5], dtype=dtyp) for dtyp in conftest.FLOAT_DTYPES ], # numpy object np.array([1, "b", 3.5], dtype=object), # pandas scalars [Interval(1, 4), Interval(4, 6), Interval(6, 9)], [ Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1) ], [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")], 
# pandas Integer arrays *[ pd.array([1, 2, 3], dtype=dtype) for dtype in conftest.ALL_EA_INT_DTYPES ], # other pandas arrays pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array, pd.date_range("2019-01-01", periods=3).array, pd.timedelta_range(start="1d", periods=3).array, ], ) def test_loc_with_non_string_categories(self, idx_values, ordered_fixture): # GH-17569 cat_idx = CategoricalIndex(idx_values, ordered=ordered_fixture) df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx) # scalar selection result = df.loc[idx_values[0]] expected = Series(["foo"], index=["A"], name=idx_values[0]) tm.assert_series_equal(result, expected) # list selection result = df.loc[idx_values[:2]] expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"]) tm.assert_frame_equal(result, expected) # scalar assignment result = df.copy() result.loc[idx_values[0]] = "qux" expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) # list assignment result = df.copy() result.loc[idx_values[:2], "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected)
def supervised_classifier(input_SOURCES, test_directory):
    """Train text classifiers on labelled page files and label a test set.

    Fits a Naive Bayes model (and a linear SVM as a second opinion) on the
    training directories, then classifies every file under *test_directory*.
    At most one page is labelled 'perfect' (the one with the highest NB
    probability, and only if the SVM agrees); all others are labelled 'bad'.

    Parameters
    ----------
    input_SOURCES : iterable of (path, classification) pairs
        Training directories and the class label for each one.
    test_directory : str
        Directory name, joined onto the base path returned by ``Address(1)``,
        containing the pages to classify.

    Returns
    -------
    list of [page_name, label] pairs, one per test page.
    """
    NEWLINE = '\n'
    SKIP_FILES = {'cmds'}

    def read_files(path):
        # Yield (file_path, content) for every readable file under `path`.
        # os.walk already descends into subdirectories, so the original
        # self-recursive call (whose generator was never consumed — a no-op)
        # has been dropped.
        for root, dir_names, file_names in os.walk(path):
            for file_name in file_names:
                if file_name in SKIP_FILES:
                    continue
                file_path = os.path.join(root, file_name)
                if not os.path.isfile(file_path):
                    continue
                # past_header starts True, so every line is kept; the
                # header-skipping branch is retained for reference.
                # past_header, lines = False, []
                past_header, lines = True, []
                # Context manager closes the handle (the original leaked one
                # open file descriptor per file read).
                with open(file_path, errors='ignore') as f:
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                content = NEWLINE.join(lines)
                yield file_path, content

    def build_data_test_frame(path):
        # Unlabelled frame: one {'text': ...} row per file, indexed by path.
        rows = []
        index = []
        for file_name, text in read_files(path):
            rows.append({'text': text})
            index.append(file_name)
        return DataFrame(rows, index=index)

    def build_data_frame(path, classification):
        # Labelled frame: {'text', 'class'} per file, indexed by file path.
        rows = []
        index = []
        for file_name, text in read_files(path):
            rows.append({'text': text, 'class': classification})
            index.append(file_name)
        return DataFrame(rows, index=index)

    # ---- Training: label the training set and fit the classifiers ----
    # Address(1) is a project helper; presumably returns a base path whose
    # first line is the extraction root — TODO confirm.
    Path_extracted = Address(1).split("\n")
    Path_extracted1 = Path_extracted[0]
    SOURCES = input_SOURCES
    data = DataFrame({'text': [], 'class': []})
    for src_path, classification in SOURCES:
        data = data.append(build_data_frame(src_path, classification),
                           sort=True)
    # Shuffle rows so training order does not reflect source directories.
    data = data.reindex(numpy.random.permutation(data.index))

    # Bag-of-words counts as features. (Swap in TfidfVectorizer here to use
    # TF-IDF features instead.)
    count_vectorizer = CountVectorizer(stop_words=None)
    counts = count_vectorizer.fit_transform(data['text'].values)

    # Naive Bayes classifier; targets are the directory labels.
    classifier = MultinomialNB()
    targets = data['class'].values
    classifier.fit(counts, targets)

    # Column of class 'perfect' in predict_proba output (0 if absent).
    perfect = 0
    for i, cls in enumerate(classifier.classes_):
        if cls == 'perfect':
            perfect = i

    # Second judge: linear SVM trained on the same features.
    clf = LinearSVC()
    clf.fit(counts, targets)

    # ---- Testing ----
    SOURCEStest = [os.path.join(Path_extracted1, test_directory)]
    data_test = DataFrame({'text': []})
    for test_path in SOURCEStest:
        data_test = data_test.append(build_data_test_frame(test_path))
    examples = data_test['text'].values
    example_counts = count_vectorizer.transform(examples)
    predictions = classifier.predict(example_counts)
    pass_list = clf.predict(example_counts)

    # Pick the page most likely to be 'perfect'; keep that label only if the
    # SVM agrees, otherwise mark every page 'bad'.
    perfect_prob = [
        pred_prob[perfect]
        for pred_prob in classifier.predict_proba(example_counts)
    ]
    perfect_idx = perfect_prob.index(max(perfect_prob))
    if pass_list[perfect_idx] == 'perfect':
        for i in range(len(predictions)):
            predictions[i] = 'perfect' if i == perfect_idx else 'bad'
    else:
        for i in range(len(predictions)):
            predictions[i] = 'bad'

    # Pair each label with the page it was predicted for.  data_test.index
    # holds the exact file paths fed to the vectorizer (os.walk order); the
    # original zipped os.listdir() with predictions, and listdir's arbitrary
    # order need not match, silently attaching labels to the wrong pages.
    page_classification_result = []
    for file_path, label in zip(data_test.index, predictions):
        page_name = os.path.basename(file_path).split(".txt")[0]
        page_classification_result.append([page_name, label])
    return page_classification_result
ser2 ser3=ser2.reindex(['A','B','C','D','E','F','G'],fill_value=0) #fill_value ser3 ser4 = Series(['USA','Mexico','Canada'],index = [0,5,10]) ser4 ser4.reindex(range(15),method='ffill') # reindexing series # DFs from numpy.random import randn df1 = DataFrame(randn(25).reshape((5,5)), index=['A','B','D','E','F'], # missed C columns=['col1','col2','col3','col4','col5']) df1 df2=df1.reindex(['A','B','C','D','E','F']) #renindexing rows of a df df2 new_columns = ['col1','col2','col3','col4','col5','col6'] df2.reindex(columns=new_columns) #reindexing columns of a df #reindexing with .ix is faster df1 df1.ix[['A','B','C','D','E','F'],new_columns] ### Droping enteries ser1=Series(np.arange(3),index=['a','b','c']) ser1 ser1.drop('b') df1=DataFrame(np.arange(9).reshape([3,3]),index=['SF','LA','NYC'],columns=['pop','size','year'])
def test_dups_fancy_indexing(self):
    """Fancy indexing with duplicate labels and/or missing labels returns
    rows in selection order, NaN-filling labels that are absent."""
    # GH 3455
    from pandas.util.testing import makeCustomDataframe as mkdf
    df = mkdf(10, 3)
    df.columns = ['a', 'a', 'b']
    result = df[['b', 'a']].columns
    expected = Index(['b', 'a', 'a'])
    tm.assert_index_equal(result, expected)
    # across dtypes
    df = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']],
                   columns=list('aaaaaaa'))
    df.head()
    str(df)
    result = DataFrame([[1, 2, 1., 2., 3., 'foo', 'bar']])
    result.columns = list('aaaaaaa')
    # TODO(wesm): unused?
    df_v = df.iloc[:, 4]  # noqa
    res_v = result.iloc[:, 4]  # noqa
    tm.assert_frame_equal(df, result)
    # GH 3561, dups not in selected order
    df = DataFrame(
        {
            'test': [5, 7, 9, 11],
            'test1': [4., 5, 6, 7],
            'other': list('abcd')
        },
        index=['A', 'A', 'B', 'C'])
    rows = ['C', 'B']
    expected = DataFrame(
        {
            'test': [11, 9],
            'test1': [7., 6],
            'other': ['d', 'c']
        },
        index=rows)
    result = df.loc[rows]
    tm.assert_frame_equal(result, expected)
    result = df.loc[Index(rows)]
    tm.assert_frame_equal(result, expected)
    rows = ['C', 'B', 'E']
    expected = DataFrame(
        {
            'test': [11, 9, np.nan],
            'test1': [7., 6, np.nan],
            'other': ['d', 'c', np.nan]
        },
        index=rows)
    result = df.loc[rows]
    tm.assert_frame_equal(result, expected)
    # see GH5553, make sure we use the right indexer
    rows = ['F', 'G', 'H', 'C', 'B', 'E']
    expected = DataFrame(
        {
            'test': [np.nan, np.nan, np.nan, 11, 9, np.nan],
            'test1': [np.nan, np.nan, np.nan, 7., 6, np.nan],
            'other': [np.nan, np.nan, np.nan, 'd', 'c', np.nan]
        },
        index=rows)
    result = df.loc[rows]
    tm.assert_frame_equal(result, expected)
    # inconsistent returns for unique/duplicate indices when values are
    # missing
    df = DataFrame(np.random.randn(4, 3), index=list('ABCD'))
    expected = df.reindex(['E'])
    dfnu = DataFrame(np.random.randn(5, 3), index=list('AABCD'))
    with catch_warnings(record=True):
        # .ix is deprecated; warnings are suppressed deliberately here
        result = dfnu.ix[['E']]
    tm.assert_frame_equal(result, expected)
    # ToDo: check_index_type can be True after GH 11497
    # GH 4619; duplicate indexer with missing label
    df = DataFrame({"A": [0, 1, 2]})
    result = df.loc[[0, 8, 0]]
    expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0])
    tm.assert_frame_equal(result, expected, check_index_type=False)
    df = DataFrame({"A": list('abc')})
    result = df.loc[[0, 8, 0]]
    expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0])
    tm.assert_frame_equal(result, expected, check_index_type=False)
    # non unique with non unique selector
    df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C'])
    expected = DataFrame({'test': [5, 7, 5, 7, np.nan]},
                         index=['A', 'A', 'A', 'A', 'E'])
    result = df.loc[['A', 'A', 'E']]
    tm.assert_frame_equal(result, expected)
    # GH 5835
    # dups on index and missing values
    df = DataFrame(np.random.randn(5, 5),
                   columns=['A', 'B', 'B', 'B', 'A'])
    expected = pd.concat([
        df.loc[:, ['A', 'B']],
        DataFrame(np.nan, columns=['C'], index=df.index)
    ], axis=1)
    result = df.loc[:, ['A', 'B', 'C']]
    tm.assert_frame_equal(result, expected)
    # GH 6504, multi-axis indexing
    df = DataFrame(np.random.randn(9, 2),
                   index=[1, 1, 1, 2, 2, 2, 3, 3, 3],
                   columns=['a', 'b'])
    expected = df.iloc[0:6]
    result = df.loc[[1, 2]]
    tm.assert_frame_equal(result, expected)
    expected = df
    result = df.loc[:, ['a', 'b']]
    tm.assert_frame_equal(result, expected)
    expected = df.iloc[0:6, :]
    result = df.loc[[1, 2], ['a', 'b']]
    tm.assert_frame_equal(result, expected)
rows = [] index = [] for file_name, text in read_files(path): rows.append({'text': text, 'class': classification}) index.append(file_name) data_frame = DataFrame(rows, index=index) return data_frame # read the corpus data data = DataFrame({'text': [], 'class': []}) for path, classification in SOURCES: data = data.append(build_data_frame(path, classification)) # randomize corpus data data = data.reindex(numpy.random.permutation(data.index)) # create the data trasformation and classification pipeline # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html # http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html pipeline = Pipeline([ ('vect', CountVectorizer(stop_words='english',lowercase=True)), ('tfidf', TfidfTransformer(use_idf=True,smooth_idf=False)), ('clf', MultinomialNB(alpha=1.0,fit_prior=True)) ]) # do k-fold cross-validation # https://en.wikipedia.org/wiki/Cross-validation_(statistics) # http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html k_fold = KFold(n=len(data), n_folds=6)