def test_merge_index_singlekey_right_vs_left(self):
    """Compare a left merge with the mirrored right merge on a single key.

    ``left`` is merged on its ``key`` column against the index of ``right``
    with ``how='left'``; the mirrored call swaps the operands and uses
    ``how='right'``.  Both results must be equal once the mirrored frame's
    columns are put in the first frame's order, for ``sort=False`` and
    ``sort=True``.
    """
    left = DataFrame({
        'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
        'v1': np.random.randn(7),
    })
    right = DataFrame({'v2': np.random.randn(4)},
                      index=['d', 'b', 'c', 'a'])

    for sort in (False, True):
        merged1 = merge(left, right, left_on='key', right_index=True,
                        how='left', sort=sort)
        merged2 = merge(right, left, right_on='key', left_index=True,
                        how='right', sort=sort)
        # ``.loc`` replaces the deprecated ``.ix`` indexer; the selection
        # is purely by column label.
        assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
def test_handle_join_key_pass_array(self):
    """Merge with join keys supplied as arrays instead of column names.

    Covers three cases: an array key on one side only, array keys on both
    sides (the key then shows up as ``key_0``), and an array key combined
    with ``left_index=True``.
    """
    # Array key on one side, column name on the other.
    left = DataFrame({"key": [1, 1, 2, 2, 3], "value": range(5)},
                     columns=["value", "key"])
    right = DataFrame({"rvalue": range(6)})
    key = np.array([1, 1, 2, 3, 4, 5])

    merged = merge(left, right, left_on="key", right_on=key, how="outer")
    merged2 = merge(right, left, left_on=key, right_on="key", how="outer")

    assert_series_equal(merged["key"], merged2["key"])
    # assertTrue replaces the deprecated ``assert_`` alias.
    self.assertTrue(merged["key"].notnull().all())
    self.assertTrue(merged2["key"].notnull().all())

    # Array keys on both sides.
    left = DataFrame({"value": range(5)}, columns=["value"])
    right = DataFrame({"rvalue": range(6)})
    lkey = np.array([1, 1, 2, 2, 3])
    rkey = np.array([1, 1, 2, 3, 4, 5])

    merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer")
    self.assertTrue(np.array_equal(merged["key_0"],
                                   np.array([1, 1, 1, 1, 2, 2, 3, 4, 5])))

    # Left index against an array key on the right.
    left = DataFrame({"value": range(3)})
    right = DataFrame({"rvalue": range(6)})
    key = np.array([0, 1, 1, 2, 2, 3])

    merged = merge(left, right, left_index=True, right_on=key, how="outer")
    self.assertTrue(np.array_equal(merged["key_0"], key))
def plotMain(logFile, outputLabels=None):
    """Plot training/test curves parsed from ``logFile``.

    Parameters
    ----------
    logFile
        Forwarded unchanged to ``logparser.parse_log``.
    outputLabels : iterable, optional
        When ``None`` the log is parsed once: the loss curves and the test
        accuracy are plotted, and the row with the highest ``TestAccuracy``
        is printed, both overall and among rows whose ``NumIters`` is a
        multiple of 10000.  Otherwise the log is parsed once per label and
        the ``TrainingLoss`` / ``TestLoss`` columns are summed across
        labels before a single loss plot is shown.
    """
    if outputLabels is None:
        df = _parse_log_frame(logFile)

        df.plot(x='NumIters', y=['TrainingLoss', 'TestLoss'])
        plt.show()
        df.plot(x='NumIters', y=['TestAccuracy'])
        plt.show()

        # idxmax returns the index *label*, which is what ``.loc`` needs;
        # ``argmax`` is positional in current pandas and would pick the
        # wrong row on the filtered frame below.
        rowMax = df['TestAccuracy'].idxmax()
        print(df.loc[[rowMax]])

        dfSub = df[df['NumIters'] % 10000 == 0]
        # Short logs may contain no checkpoint iteration at all.
        if not dfSub.empty:
            rowMax = dfSub['TestAccuracy'].idxmax()
            print(dfSub.loc[[rowMax]])
    else:
        df = pd.DataFrame()
        for lbl in outputLabels:
            mergedCurrent = _parse_log_frame(logFile, lbl)
            if 'NumIters' in df:
                # Overlapping columns get the default _x/_y suffixes; fold
                # the two loss pairs back into single summed columns.
                df = merge(df, mergedCurrent, how='inner', on='NumIters')
                df['TrainingLoss'] = (df['TrainingLoss_x'] +
                                      df['TrainingLoss_y'])
                df['TestLoss'] = df['TestLoss_x'] + df['TestLoss_y']
                df = df.drop(['TrainingLoss_x', 'TrainingLoss_y',
                              'TestLoss_x', 'TestLoss_y'], axis=1)
            else:
                df = mergedCurrent
        df.plot(x='NumIters', y=['TrainingLoss', 'TestLoss'])
        plt.show()


def _parse_log_frame(logFile, *parse_args):
    """Parse ``logFile`` and inner-merge training and test rows on NumIters."""
    train_dict_list, _, test_dict_list, _ = logparser.parse_log(
        logFile, *parse_args)
    dfTraining = pd.DataFrame(
        train_dict_list,
        columns=['NumIters', 'LearningRate', 'TrainingLoss'])
    dfTest = pd.DataFrame(
        test_dict_list,
        columns=['Seconds', 'NumIters', 'TestLoss', 'TestAccuracy'])
    return merge(dfTraining, dfTest, how='inner', on='NumIters')
def test_inner_join(self):
    """Inner-merge the fixture frames on ``key2`` and on the default keys."""
    how = 'inner'

    # Explicit single join key.
    by_key2 = merge(self.df, self.df2, on='key2', how=how)
    _check_join(self.df, self.df2, by_key2, ['key2'], how=how)

    # No ``on`` given; the result is checked against key1 and key2.
    by_default = merge(self.df, self.df2, how=how)
    _check_join(self.df, self.df2, by_default, ['key1', 'key2'], how=how)
def test_handle_join_key_pass_array(self):
    """Merge with join keys given as arrays rather than column names.

    Exercises an array key on one side, array keys on both sides (exposed
    in the result as ``key_0``) and an array key paired with
    ``left_index=True``.
    """
    left = DataFrame({'key': [1, 1, 2, 2, 3], 'value': range(5)},
                     columns=['value', 'key'])
    right = DataFrame({'rvalue': range(6)})
    key = np.array([1, 1, 2, 3, 4, 5])

    merged = merge(left, right, left_on='key', right_on=key, how='outer')
    merged2 = merge(right, left, left_on=key, right_on='key', how='outer')

    assert_series_equal(merged['key'], merged2['key'])
    # ``assert_`` is a deprecated alias of ``assertTrue``.
    self.assertTrue(merged['key'].notnull().all())
    self.assertTrue(merged2['key'].notnull().all())

    left = DataFrame({'value': range(5)}, columns=['value'])
    right = DataFrame({'rvalue': range(6)})
    lkey = np.array([1, 1, 2, 2, 3])
    rkey = np.array([1, 1, 2, 3, 4, 5])

    merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer')
    expected_keys = np.array([1, 1, 1, 1, 2, 2, 3, 4, 5])
    self.assertTrue(np.array_equal(merged['key_0'], expected_keys))

    left = DataFrame({'value': range(3)})
    right = DataFrame({'rvalue': range(6)})
    key = np.array([0, 1, 1, 2, 2, 3])

    merged = merge(left, right, left_index=True, right_on=key, how='outer')
    self.assertTrue(np.array_equal(merged['key_0'], key))
def test_join_inner_multiindex(self):
    """Inner ``join`` on two key columns against a MultiIndex-ed frame.

    The joined frame is compared with two equivalent ``merge`` calls: one
    with the frames swapped (``left_index=True``) and one against
    ``to_join.reset_index()``.
    """
    key1 = ["bar", "bar", "bar", "foo", "foo",
            "baz", "baz", "qux", "qux", "snap"]
    key2 = ["two", "one", "three", "one", "two",
            "one", "two", "two", "three", "one"]

    data = np.random.randn(len(key1))
    data = DataFrame({"key1": key1, "key2": key2, "data": data})

    index = MultiIndex(
        levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
        labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    to_join = DataFrame(np.random.randn(10, 3), index=index,
                        columns=["j_one", "j_two", "j_three"])

    joined = data.join(to_join, on=["key1", "key2"], how="inner")
    expected = merge(data, to_join.reset_index(),
                     left_on=["key1", "key2"],
                     right_on=["first", "second"],
                     how="inner", sort=False)

    # The original recomputed this identical merge a second time without
    # using the result; a single call is enough.
    expected2 = merge(to_join, data, right_on=["key1", "key2"],
                      left_index=True, how="inner", sort=False)
    assert_frame_equal(joined, expected2.reindex_like(joined))

    expected = expected.drop(["first", "second"], axis=1)
    expected.index = joined.index

    # assertTrue replaces the deprecated ``assert_`` alias.
    self.assertTrue(joined.index.is_monotonic)
    assert_frame_equal(joined, expected)
def test_right_outer_join(self):
    """Right-merge the fixture frames on ``key2`` and on the default keys."""
    how = 'right'

    # Explicit single join key.
    by_key2 = merge(self.df, self.df2, on='key2', how=how)
    _check_join(self.df, self.df2, by_key2, ['key2'], how=how)

    # No ``on`` given; the result is checked against key1 and key2.
    by_default = merge(self.df, self.df2, how=how)
    _check_join(self.df, self.df2, by_default, ['key1', 'key2'], how=how)
def test_handle_join_key_pass_array(self):
    """Merge using arrays as join keys in place of column names.

    Three scenarios are checked: an array key on a single side, array
    keys on both sides (surfaced as ``key_0``) and an array key together
    with ``left_index=True``.
    """
    left = DataFrame({
        'key': [1, 1, 2, 2, 3],
        'value': range(5),
    }, columns=['value', 'key'])
    right = DataFrame({'rvalue': range(6)})
    key = np.array([1, 1, 2, 3, 4, 5])

    merged = merge(left, right, left_on='key', right_on=key, how='outer')
    merged2 = merge(right, left, left_on=key, right_on='key', how='outer')

    assert_series_equal(merged['key'], merged2['key'])
    # Use assertTrue: ``assert_`` is a deprecated unittest alias.
    for frame in (merged, merged2):
        self.assertTrue(frame['key'].notnull().all())

    left = DataFrame({'value': range(5)}, columns=['value'])
    right = DataFrame({'rvalue': range(6)})
    lkey = np.array([1, 1, 2, 2, 3])
    rkey = np.array([1, 1, 2, 3, 4, 5])

    merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer')
    self.assertTrue(
        np.array_equal(merged['key_0'],
                       np.array([1, 1, 1, 1, 2, 2, 3, 4, 5])))

    left = DataFrame({'value': range(3)})
    right = DataFrame({'rvalue': range(6)})
    key = np.array([0, 1, 1, 2, 2, 3])

    merged = merge(left, right, left_index=True, right_on=key, how='outer')
    self.assertTrue(np.array_equal(merged['key_0'], key))
def test_join_on_fails_with_different_column_counts(self):
    """``merge`` must raise ValueError for mismatched key counts.

    ``left_on`` names two columns while ``right_on`` names only one.
    """
    df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                    'b': np.random.randn(3)})
    df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                     'b': np.random.randn(10)},
                    index=tm.makeCustomIndex(10, 2))

    # Only the merge call lives inside the context manager, so a
    # ValueError raised while building the fixtures can no longer make
    # the test pass by accident.
    with tm.assertRaises(ValueError):
        merge(df, df2, right_on='a', left_on=['a', 'b'])
def test_join_many_non_unique_index(self):
    """Joining a list of frames on a non-unique index matches chained merges.

    The outer and inner multi-frame joins are compared with two chained
    ``merge`` calls; the last section (GH 11519) joins a frame with a
    Series that has a repeated index using every ``how``.
    """
    df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
    df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
    df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
    idf1 = df1.set_index(["a", "b"])
    idf2 = df2.set_index(["a", "b"])
    idf3 = df3.set_index(["a", "b"])

    result = idf1.join([idf2, idf3], how='outer')

    df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
    expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

    result = result.reset_index()
    expected = expected[result.columns]
    expected['a'] = expected.a.astype('int64')
    expected['b'] = expected.b.astype('int64')
    assert_frame_equal(result, expected)

    df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
    df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
    df3 = DataFrame({
        "a": [1, 1, 1],
        "b": [1, 1, 2],
        "e": [1000, 2000, 3000],
    })
    idf1 = df1.set_index(["a", "b"])
    idf2 = df2.set_index(["a", "b"])
    idf3 = df3.set_index(["a", "b"])
    result = idf1.join([idf2, idf3], how='inner')

    df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
    expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

    result = result.reset_index()

    # ``.loc`` replaces the deprecated ``.ix`` indexer (label selection).
    assert_frame_equal(result, expected.loc[:, result.columns])

    # GH 11519
    df = DataFrame({
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
        'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    })
    s = Series(np.repeat(np.arange(8), 2),
               index=np.repeat(np.arange(8), 2), name='TEST')
    inner = df.join(s, how='inner')
    outer = df.join(s, how='outer')
    left = df.join(s, how='left')
    right = df.join(s, how='right')
    assert_frame_equal(inner, outer)
    assert_frame_equal(inner, left)
    assert_frame_equal(inner, right)
def test_join_on_fails_with_wrong_object_type(self):
    """``merge`` rejects non-DataFrame operands on either side (GH12081).

    For each bad object the ValueError message is matched against
    ``str(type(obj))``.
    """
    frame = DataFrame({'a': [1, 1]})
    not_frames = [Series([0, 1]), 2, 'str', None, np.array([0, 1])]

    for bad in not_frames:
        type_repr = str(type(bad))
        # Bad object as the left operand ...
        with tm.assertRaisesRegexp(ValueError, type_repr):
            merge(bad, frame, left_on='a', right_on='a')
        # ... and as the right operand.
        with tm.assertRaisesRegexp(ValueError, type_repr):
            merge(frame, bad, left_on='a', right_on='a')
def test_merge_index_singlekey_right_vs_left(self):
    """A left merge must equal the mirrored right merge on a single key.

    The comparison is made for ``sort=False`` and ``sort=True`` after
    putting the mirrored result's columns in the first result's order.
    """
    left = DataFrame({"key": ["a", "b", "c", "d", "e", "e", "a"],
                      "v1": np.random.randn(7)})
    right = DataFrame({"v2": np.random.randn(4)},
                      index=["d", "b", "c", "a"])

    for sort in (False, True):
        merged1 = merge(left, right, left_on="key", right_index=True,
                        how="left", sort=sort)
        merged2 = merge(right, left, right_on="key", left_index=True,
                        how="right", sort=sort)
        # Label-based ``.loc`` instead of the deprecated ``.ix``.
        assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
def test_merge_index_singlekey_right_vs_left(self):
    """A left merge must equal the mirrored right merge on a single key.

    ``left`` is merged on its ``key`` column against the index of
    ``right``; the mirrored call swaps operands and uses ``how='right'``.
    """
    left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                      'v1': np.random.randn(7)})
    right = DataFrame({'v2': np.random.randn(4)},
                      index=['d', 'b', 'c', 'a'])

    merged1 = merge(left, right, left_on='key',
                    right_index=True, how='left')
    merged2 = merge(right, left, right_on='key',
                    left_index=True, how='right')

    # Label-based ``.loc`` instead of the deprecated ``.ix``; it only
    # reorders the columns of the mirrored result.
    assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
def test_merge_index_singlekey_inner(self):
    """Inner merge of a key column against an index, in both operand orders.

    Each result is compared with ``left.join(right, on="key")`` restricted
    to the rows present in the merge result.
    """
    left = DataFrame({"key": ["a", "b", "c", "d", "e", "e", "a"],
                      "v1": np.random.randn(7)})
    right = DataFrame({"v2": np.random.randn(4)},
                      index=["d", "b", "c", "a"])

    # inner join
    result = merge(left, right, left_on="key", right_index=True,
                   how="inner")
    # ``.loc`` replaces the deprecated ``.ix``; rows are picked by label.
    expected = left.join(right, on="key").loc[result.index]
    assert_frame_equal(result, expected)

    # Same join with the operands swapped; column order differs, so the
    # expected frame is re-ordered by label before comparing.
    result = merge(right, left, right_on="key", left_index=True,
                   how="inner")
    expected = left.join(right, on="key").loc[result.index]
    assert_frame_equal(result, expected.loc[:, result.columns])
def test_join_on_fails_with_different_column_counts(self):
    """Mismatched ``left_on`` / ``right_on`` lengths must raise ValueError.

    ``left_on`` lists two columns, ``right_on`` a single one.
    """
    df = DataFrame({
        'a': np.random.choice(['m', 'f'], size=3),
        'b': np.random.randn(3),
    })
    df2 = DataFrame(
        {
            'a': np.random.choice(['m', 'f'], size=10),
            'b': np.random.randn(10),
        },
        index=tm.makeCustomIndex(10, 2))

    # Keep the guarded region minimal: a ValueError from the fixture
    # setup above must not be mistaken for the expected merge failure.
    with tm.assertRaises(ValueError):
        merge(df, df2, right_on='a', left_on=['a', 'b'])
def test_join_many_non_unique_index(self):
    """Multi-frame joins on a non-unique index must match chained merges.

    Outer and inner joins of three frames are checked against two chained
    ``merge`` calls.  The final section (GH 11519) joins a frame with a
    Series whose index has repeated values, for every ``how``.
    """
    df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
    df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
    df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
    idf1 = df1.set_index(["a", "b"])
    idf2 = df2.set_index(["a", "b"])
    idf3 = df3.set_index(["a", "b"])

    result = idf1.join([idf2, idf3], how='outer')

    df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
    expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

    result = result.reset_index()
    expected = expected[result.columns]
    expected['a'] = expected.a.astype('int64')
    expected['b'] = expected.b.astype('int64')
    assert_frame_equal(result, expected)

    df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
    df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
    df3 = DataFrame(
        {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
    idf1 = df1.set_index(["a", "b"])
    idf2 = df2.set_index(["a", "b"])
    idf3 = df3.set_index(["a", "b"])
    result = idf1.join([idf2, idf3], how='inner')

    df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
    expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

    result = result.reset_index()

    # Deprecated ``.ix`` swapped for label-based ``.loc``.
    assert_frame_equal(result, expected.loc[:, result.columns])

    # GH 11519
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})
    s = Series(np.repeat(np.arange(8), 2),
               index=np.repeat(np.arange(8), 2), name='TEST')
    inner = df.join(s, how='inner')
    # Every other join type must give the same frame as the inner join.
    for how in ('outer', 'left', 'right'):
        assert_frame_equal(inner, df.join(s, how=how))
def test_join_inner_multiindex(self):
    """Inner ``join`` on two key columns against a MultiIndex-ed frame.

    The result is checked against a swapped-operand ``merge`` using
    ``left_index=True`` and against a ``merge`` with
    ``to_join.reset_index()``.
    """
    key1 = ['bar', 'bar', 'bar', 'foo', 'foo',
            'baz', 'baz', 'qux', 'qux', 'snap']
    key2 = ['two', 'one', 'three', 'one', 'two',
            'one', 'two', 'two', 'three', 'one']

    data = np.random.randn(len(key1))
    data = DataFrame({'key1': key1, 'key2': key2, 'data': data})

    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                               ['one', 'two', 'three']],
                       labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                               [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['first', 'second'])
    to_join = DataFrame(np.random.randn(10, 3), index=index,
                        columns=['j_one', 'j_two', 'j_three'])

    joined = data.join(to_join, on=['key1', 'key2'], how='inner')
    expected = merge(data, to_join.reset_index(),
                     left_on=['key1', 'key2'],
                     right_on=['first', 'second'],
                     how='inner', sort=False)

    # Computed once; the original repeated this identical merge and then
    # never used the second result.
    expected2 = merge(to_join, data, right_on=['key1', 'key2'],
                      left_index=True, how='inner', sort=False)
    assert_frame_equal(joined, expected2.reindex_like(joined))

    expected = expected.drop(['first', 'second'], axis=1)
    expected.index = joined.index

    self.assertTrue(joined.index.is_monotonic)
    assert_frame_equal(joined, expected)
def plotMain(logFile, outputLabels=None):
    """Plot the loss and accuracy curves parsed from ``logFile``.

    Parameters
    ----------
    logFile
        Passed as-is to ``logparser.parse_log``.
    outputLabels : iterable, optional
        If ``None``, the log is parsed once; loss and accuracy plots are
        shown and the best ``TestAccuracy`` row is printed, overall and
        among rows whose ``NumIters`` is a multiple of 10000.  Otherwise
        the log is parsed once per label, ``TrainingLoss`` and
        ``TestLoss`` are summed over the labels, and one loss plot is
        shown.
    """
    if outputLabels is None:
        df = _parse_log_frame(logFile)

        df.plot(x='NumIters', y=['TrainingLoss', 'TestLoss'])
        plt.show()
        df.plot(x='NumIters', y=['TestAccuracy'])
        plt.show()

        # ``.loc`` expects an index label, which is what idxmax returns;
        # argmax is positional in current pandas and mismatches the
        # labels of the filtered frame below.
        best = df['TestAccuracy'].idxmax()
        print(df.loc[[best]])

        dfSub = df[df['NumIters'] % 10000 == 0]
        # Guard against logs with no multiple-of-10000 iteration.
        if not dfSub.empty:
            best = dfSub['TestAccuracy'].idxmax()
            print(dfSub.loc[[best]])
    else:
        df = pd.DataFrame()
        for lbl in outputLabels:
            mergedCurrent = _parse_log_frame(logFile, lbl)
            if 'NumIters' not in df:
                # First label: nothing to accumulate onto yet.
                df = mergedCurrent
                continue
            # The merge suffixes clashing columns with _x/_y; collapse
            # the two loss pairs into running sums.
            df = merge(df, mergedCurrent, how='inner', on='NumIters')
            df['TrainingLoss'] = df['TrainingLoss_x'] + df['TrainingLoss_y']
            df['TestLoss'] = df['TestLoss_x'] + df['TestLoss_y']
            df = df.drop([
                'TrainingLoss_x', 'TrainingLoss_y',
                'TestLoss_x', 'TestLoss_y',
            ], axis=1)
        df.plot(x='NumIters', y=['TrainingLoss', 'TestLoss'])
        plt.show()


def _parse_log_frame(logFile, *parse_args):
    """Parse ``logFile`` and inner-merge training and test rows on NumIters."""
    train_dict_list, _, test_dict_list, _ = logparser.parse_log(
        logFile, *parse_args)
    dfTraining = pd.DataFrame(
        train_dict_list,
        columns=['NumIters', 'LearningRate', 'TrainingLoss'])
    dfTest = pd.DataFrame(
        test_dict_list,
        columns=['Seconds', 'NumIters', 'TestLoss', 'TestAccuracy'])
    return merge(dfTraining, dfTest, how='inner', on='NumIters')
def test_merge_index_singlekey_inner(self):
    """Inner merge of a key column against an index, in both operand orders.

    Both results are compared with ``left.join(right, on='key')`` limited
    to the rows that appear in the merge result.
    """
    left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                      'v1': np.random.randn(7)})
    right = DataFrame({'v2': np.random.randn(4)},
                      index=['d', 'b', 'c', 'a'])

    # inner join
    result = merge(left, right, left_on='key', right_index=True,
                   how='inner')
    # Deprecated ``.ix`` swapped for label-based ``.loc``.
    expected = left.join(right, on='key').loc[result.index]
    assert_frame_equal(result, expected)

    # Operands swapped: the column order changes, so align it by label.
    result = merge(right, left, right_on='key', left_index=True,
                   how='inner')
    expected = left.join(right, on='key').loc[result.index]
    assert_frame_equal(result, expected.loc[:, result.columns])
def test_handle_overlap_arbitrary_key(self):
    """Custom suffixes are applied when merging on differently named keys.

    The fixtures are merged with ``left_on='key2'`` / ``right_on='key1'``
    and the suffixed column names are looked up in the result.
    """
    joined = merge(self.df, self.df2,
                   left_on='key2', right_on='key1',
                   suffixes=['.foo', '.bar'])
    # assertIn replaces the deprecated ``assert_`` alias and reports the
    # missing column name on failure.
    self.assertIn('key1.foo', joined)
    self.assertIn('key2.bar', joined)
def test_left_join_index_preserve_order(self):
    """Joining on two columns against a MultiIndex keeps the left row order.

    The expected frame is ``left`` plus a ``v2`` column that is NaN except
    for the two key combinations present in ``right``.
    """
    left = DataFrame({
        'k1': [0, 1, 2] * 8,
        'k2': ['foo', 'bar'] * 12,
        'v': np.arange(24),
    })

    index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
    right = DataFrame({'v2': [5, 7]}, index=index)

    result = left.join(right, on=['k1', 'k2'])

    expected = left.copy()
    expected['v2'] = np.nan
    # A single ``.loc`` assignment instead of chained indexing
    # (``expected['v2'][mask] = ...``), which may write to a temporary
    # copy and leave ``expected`` unchanged.
    expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
    expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7

    tm.assert_frame_equal(result, expected)

    # do a right join for an extra test
    joined = merge(right, left, left_index=True,
                   right_on=['k1', 'k2'], how='right')
    # Deprecated ``.ix`` swapped for label-based ``.loc``.
    tm.assert_frame_equal(joined.loc[:, expected.columns], expected)
def load_indices(self, tickers, startdate, lags):
    """Load lagged daily-change data for ``tickers`` into ``self.dataframe``.

    If ``DATA.csv`` exists it is read and used as-is.  Otherwise each
    ticker is downloaded with ``web.get_data_yahoo``, reduced to the
    1-day percentage change of ``Adj Close``, passed through
    ``preprocess``, extended with ``lags`` shifted copies, and the
    per-ticker frames are merged on their index and written to
    ``DATA.csv``.

    Parameters
    ----------
    tickers : iterable of str
        Ticker symbols; stored on ``self.tickers``.
    startdate
        Start of the download range; stored on ``self.startdate``.
    lags : int
        Number of lagged columns to add per ticker.
    """
    self.tickers = tickers
    self.filename = "DATA.csv"
    self.startdate = startdate
    self.enddate = datetime.date.today().strftime("%Y%m%d")

    if os.path.isfile(self.filename):
        # NOTE(review): DataFrame.from_csv is deprecated in newer pandas;
        # kept because the rest of this file targets an older release.
        data = pan.DataFrame.from_csv(self.filename)
        self.dataframe = data
    else:
        merged = None
        for ticker in tickers:
            data = web.get_data_yahoo(ticker, self.startdate, self.enddate)
            index = ticker + '1change'
            data[index] = data['Adj Close'].pct_change(1)

            # remove unused columns and nan row
            data = data[[index]]
            data = data[1:]

            # filter out middle threshold noise
            #data = data[np.logical_or(data[index] >= threshold, data[index] <= -threshold)]

            # preprocess data
            data = data.apply(preprocess)

            # lag data
            for i in range(1, lags + 1):
                label = ticker + "%dlag" % i
                data[label] = data[index].shift(i)

            # remove rows used for change calculation
            data = data[lags + 1:]
            print(data.head(10))

            # The first frame becomes the merge base.  The original only
            # initialised ``self.sp`` for "%5EGSPC" and therefore read an
            # unset attribute when that ticker did not come first.
            if ticker == "%5EGSPC" or merged is None:
                merged = data
            else:
                merged = merge(merged, data,
                               left_index=True, right_index=True)
            self.sp = merged

        self.dataframe = self.sp
        self.dataframe.to_csv(self.filename)
def test_handle_overlap_arbitrary_key(self):
    """Custom suffixes are applied when merging on differently named keys."""
    suffixes = ['.foo', '.bar']
    joined = merge(self.df, self.df2,
                   left_on='key2', right_on='key1',
                   suffixes=suffixes)

    # Both suffixed column names must be present in the result.
    for column in ('key1.foo', 'key2.bar'):
        self.assertIn(column, joined)
def test_join_hierarchical_mixed(self):
    """Index merge of a two-level-column frame with a flat frame (GH 2024).

    The merge is expected to emit a ``UserWarning`` (GH 9455, 12219) and
    the result must contain both the tuple and the plain column label.
    """
    # GH 2024
    base = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
    aggregated = base.groupby(['a']).agg({'b': [np.mean, np.sum]})

    flat = DataFrame([(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
    flat = flat.set_index('a')

    # GH 9455, 12219
    with tm.assert_produces_warning(UserWarning):
        result = merge(aggregated, flat, left_index=True, right_index=True)

    for label in (('b', 'mean'), 'b'):
        self.assertTrue(label in result)
def test_merge_nocopy(self):
    """With ``copy=False`` writes to the merged frame reach the inputs.

    After an index-on-index merge, assigning a column on the result is
    expected to be visible in the frame that column came from.
    """
    left = DataFrame({"a": 0, "b": 1}, index=range(10))
    right = DataFrame({"c": "foo", "d": "bar"}, index=range(10))

    merged = merge(left, right, left_index=True, right_index=True,
                   copy=False)

    # assertTrue replaces the deprecated ``assert_`` alias.
    merged["a"] = 6
    self.assertTrue((left["a"] == 6).all())

    merged["d"] = "peekaboo"
    self.assertTrue((right["d"] == "peekaboo").all())
def test_merge_nocopy(self):
    """With ``copy=False`` the merged frame shares data with its inputs.

    Columns assigned on the merge result are expected to show up in the
    corresponding source frame.
    """
    left = DataFrame({'a': 0, 'b': 1}, index=range(10))
    right = DataFrame({'c': 'foo', 'd': 'bar'}, index=range(10))

    merged = merge(left, right, left_index=True,
                   right_index=True, copy=False)

    merged['a'] = 6
    # ``assert_`` is a deprecated alias; use assertTrue.
    self.assertTrue((left['a'] == 6).all())

    merged['d'] = 'peekaboo'
    self.assertTrue((right['d'] == 'peekaboo').all())
def test_left_merge_na_buglet(self):
    """Left merge on ``id`` where the right key column contains NaN.

    The result must equal ``left`` joined with the right frame's non-key
    columns.
    """
    column_order = ["id", "v1", "v2", "dummy", "v3"]
    left_data = {
        "id": list("abcde"),
        "v1": randn(5),
        "v2": randn(5),
        "dummy": list("abcde"),
        "v3": randn(5),
    }
    left = DataFrame(left_data, columns=column_order)

    right = DataFrame({
        "id": ["a", "b", np.nan, np.nan, np.nan],
        "sv3": [1.234, 5.678, np.nan, np.nan, np.nan],
    })

    merged = merge(left, right, on="id", how="left")

    # Expected: the right frame's value column attached by position.
    expected = left.join(right.drop(["id"], axis=1))
    tm.assert_frame_equal(merged, expected)
def test_join_hierarchical_mixed(self):
    """Index merge of a two-level-column frame with a flat frame (GH 2024).

    A ``UserWarning`` is expected from the merge (GH 9455, 12219); the
    result must contain both ``('b', 'mean')`` and ``'b'``.
    """
    # GH 2024
    source = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
    grouped = source.groupby(['a']).agg({'b': [np.mean, np.sum]})

    plain = DataFrame(
        [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']).set_index('a')

    # GH 9455, 12219
    with tm.assert_produces_warning(UserWarning):
        result = merge(grouped, plain, left_index=True, right_index=True)

    has_tuple_label = ('b', 'mean') in result
    self.assertTrue(has_tuple_label)
    has_flat_label = 'b' in result
    self.assertTrue(has_flat_label)
def test_compress_group_combinations(self):
    """Smoke-test an outer merge on two high-cardinality key columns.

    Nothing is asserted; the merge only has to complete.
    """
    # ~ 40000000 possible unique groups
    # ``range`` instead of the Python 2-only ``xrange``.
    key1 = np.array([rands(10) for _ in range(10000)], dtype="O")
    key1 = np.tile(key1, 2)
    key2 = key1[::-1]

    df = DataFrame({"key1": key1, "key2": key2,
                    "value1": np.random.randn(20000)})
    df2 = DataFrame({"key1": key1[::2], "key2": key2[::2],
                     "value2": np.random.randn(10000)})

    # just to hit the label compression code path; the result itself is
    # not inspected, so it is not bound to a name.
    merge(df, df2, how="outer")
def test_left_merge_na_buglet(self):
    """Left merge on ``id`` where the right key column contains NaN.

    The merged frame must equal ``left`` joined with the right frame
    after its ``id`` column is dropped.
    """
    left = DataFrame(
        {
            'id': list('abcde'),
            'v1': randn(5),
            'v2': randn(5),
            'dummy': list('abcde'),
            'v3': randn(5),
        },
        columns=['id', 'v1', 'v2', 'dummy', 'v3'],
    )

    right_ids = ['a', 'b', np.nan, np.nan, np.nan]
    right_vals = [1.234, 5.678, np.nan, np.nan, np.nan]
    right = DataFrame({'id': right_ids, 'sv3': right_vals})

    merged = merge(left, right, on='id', how='left')

    right_values_only = right.drop(['id'], axis=1)
    expected = left.join(right_values_only)

    tm.assert_frame_equal(merged, expected)
def test_join_inner_multiindex(self):
    """Inner ``join`` on two key columns against a MultiIndex-ed frame.

    The joined frame must match both a swapped-operand ``merge`` with
    ``left_index=True`` and a ``merge`` against ``to_join.reset_index()``.
    """
    key1 = ['bar', 'bar', 'bar', 'foo', 'foo',
            'baz', 'baz', 'qux', 'qux', 'snap']
    key2 = ['two', 'one', 'three', 'one', 'two',
            'one', 'two', 'two', 'three', 'one']

    data = np.random.randn(len(key1))
    data = DataFrame({'key1': key1, 'key2': key2, 'data': data})

    index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                               ['one', 'two', 'three']],
                       labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                               [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                       names=['first', 'second'])
    to_join = DataFrame(np.random.randn(10, 3), index=index,
                        columns=['j_one', 'j_two', 'j_three'])

    joined = data.join(to_join, on=['key1', 'key2'], how='inner')
    expected = merge(data, to_join.reset_index(),
                     left_on=['key1', 'key2'],
                     right_on=['first', 'second'],
                     how='inner', sort=False)

    # One merge suffices: the original ran this identical call twice and
    # discarded the second result.
    expected2 = merge(to_join, data, right_on=['key1', 'key2'],
                      left_index=True, how='inner', sort=False)
    assert_frame_equal(joined, expected2.reindex_like(joined))

    expected = expected.drop(['first', 'second'], axis=1)
    expected.index = joined.index

    # ``assert_`` is a deprecated alias; use assertTrue.
    self.assertTrue(joined.index.is_monotonic)
    assert_frame_equal(joined, expected)
def test_compress_group_combinations(self):
    """Smoke-test an outer merge on two high-cardinality key columns.

    The test makes no assertion; it passes if the merge runs to the end.
    """
    # ~ 40000000 possible unique groups
    # ``xrange`` exists only on Python 2; ``range`` works on both.
    key1 = np.array([rands(10) for _ in range(10000)], dtype='O')
    key1 = np.tile(key1, 2)
    key2 = key1[::-1]

    df = DataFrame({'key1': key1,
                    'key2': key2,
                    'value1': np.random.randn(20000)})
    df2 = DataFrame({'key1': key1[::2],
                     'key2': key2[::2],
                     'value2': np.random.randn(10000)})

    # just to hit the label compression code path (result unused, so no
    # local is kept for it)
    merge(df, df2, how='outer')
def test_intelligently_handle_join_key(self):
    """Outer merge on ``key`` returns the expected, consolidated frame."""
    # #733, be a bit more 1337 about not returning unconsolidated DataFrame

    left = DataFrame({'key': [1, 1, 2, 2, 3],
                      'value': range(5)}, columns=['value', 'key'])
    right = DataFrame({'key': [1, 1, 2, 3, 4, 5],
                       'rvalue': range(6)})

    joined = merge(left, right, on='key', how='outer')
    expected = DataFrame(
        {'key': [1, 1, 1, 1, 2, 2, 3, 4, 5.],
         'value': np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]),
         'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])},
        columns=['value', 'key', 'rvalue'])
    assert_frame_equal(joined, expected)

    # assertTrue replaces the deprecated ``assert_`` alias.
    self.assertTrue(joined._data.is_consolidated())
def test_intelligently_handle_join_key(self):
    """Outer merge on ``key`` yields the expected frame, consolidated."""
    # #733, be a bit more 1337 about not returning unconsolidated DataFrame

    left = DataFrame({"key": [1, 1, 2, 2, 3], "value": range(5)},
                     columns=["value", "key"])
    right = DataFrame({"key": [1, 1, 2, 3, 4, 5], "rvalue": range(6)})

    joined = merge(left, right, on="key", how="outer")
    expected = DataFrame(
        {
            "key": [1, 1, 1, 1, 2, 2, 3, 4, 5.0],
            "value": np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]),
            "rvalue": np.array([0, 1, 0, 1, 2, 2, 3, 4, 5]),
        },
        columns=["value", "key", "rvalue"],
    )
    assert_frame_equal(joined, expected)

    # ``assert_`` is a deprecated unittest alias; use assertTrue.
    self.assertTrue(joined._data.is_consolidated())
def test_left_join_index_preserve_order(self):
    """Joining on two columns against a MultiIndex keeps the left row order.

    ``expected`` is ``left`` with an added ``v2`` column that is NaN apart
    from the two key pairs found in ``right``.
    """
    left = DataFrame({"k1": [0, 1, 2] * 8,
                      "k2": ["foo", "bar"] * 12,
                      "v": np.arange(24)})

    index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
    right = DataFrame({"v2": [5, 7]}, index=index)

    result = left.join(right, on=["k1", "k2"])

    expected = left.copy()
    expected["v2"] = np.nan
    # Assign through one ``.loc`` call; the chained form
    # ``expected["v2"][mask] = ...`` can end up writing to a temporary.
    is_2_bar = (expected.k1 == 2) & (expected.k2 == "bar")
    expected.loc[is_2_bar, "v2"] = 5
    is_1_foo = (expected.k1 == 1) & (expected.k2 == "foo")
    expected.loc[is_1_foo, "v2"] = 7

    tm.assert_frame_equal(result, expected)

    # do a right join for an extra test
    joined = merge(right, left, left_index=True,
                   right_on=["k1", "k2"], how="right")
    # Label-based ``.loc`` instead of the deprecated ``.ix``.
    tm.assert_frame_equal(joined.loc[:, expected.columns], expected)
def test_left_join_index_preserve_order(self):
    """Joining on two columns against a MultiIndex keeps the left row order.

    The expected frame is a copy of ``left`` with a ``v2`` column filled
    only for the two key pairs that exist in ``right``.
    """
    left = DataFrame({'k1': [0, 1, 2] * 8,
                      'k2': ['foo', 'bar'] * 12,
                      'v': np.arange(24)})

    index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
    right = DataFrame({'v2': [5, 7]}, index=index)

    result = left.join(right, on=['k1', 'k2'])

    expected = left.copy()
    expected['v2'] = np.nan
    # ``.loc[mask, col] = value`` writes into ``expected`` directly,
    # unlike the chained ``expected['v2'][mask] = value`` form which may
    # modify a temporary copy.
    expected.loc[(expected.k1 == 2) & (expected.k2 == 'bar'), 'v2'] = 5
    expected.loc[(expected.k1 == 1) & (expected.k2 == 'foo'), 'v2'] = 7

    tm.assert_frame_equal(result, expected)

    # do a right join for an extra test
    joined = merge(right, left, left_index=True,
                   right_on=['k1', 'k2'], how='right')
    # Deprecated ``.ix`` swapped for ``.loc``.
    tm.assert_frame_equal(joined.loc[:, expected.columns], expected)
def test_merge_overlap(self):
    """Self-merge on ``key`` suffixes the overlapping ``v1`` column.

    The expected row count is the sum of squared per-key counts.
    """
    merged = merge(self.left, self.left, on='key')
    exp_len = (self.left['key'].value_counts() ** 2).sum()
    self.assertEqual(len(merged), exp_len)
    # assertIn replaces the deprecated ``assert_`` alias and names the
    # missing column on failure.
    self.assertIn('v1_x', merged)
    self.assertIn('v1_y', merged)
def test_merge_common(self):
    """Merging without ``on`` equals merging on ``key1`` and ``key2``."""
    implicit_keys = merge(self.df, self.df2)
    explicit_keys = merge(self.df, self.df2, on=['key1', 'key2'])
    tm.assert_frame_equal(implicit_keys, explicit_keys)
def test_merge_overlap(self):
    """Self-merge on ``key`` suffixes the overlapping ``v1`` column.

    The row count must equal the sum of squared per-key counts, and the
    ``v1.x`` / ``v1.y`` columns must be present.
    """
    merged = merge(self.left, self.left, on='key')
    exp_len = (self.left['key'].value_counts()**2).sum()
    self.assertEqual(len(merged), exp_len)
    # ``assert_`` is a deprecated alias; assertIn also gives a clearer
    # failure message.
    self.assertIn('v1.x', merged)
    self.assertIn('v1.y', merged)
def test_merge_overlap(self):
    """Self-merge on ``key`` suffixes the overlapping ``v1`` column.

    Each key contributes ``count ** 2`` rows to the result.
    """
    merged = merge(self.left, self.left, on="key")
    exp_len = (self.left["key"].value_counts() ** 2).sum()
    self.assertEqual(len(merged), exp_len)
    # Deprecated ``assert_(x in y)`` replaced by assertIn.
    for column in ("v1_x", "v1_y"):
        self.assertIn(column, merged)
def test_handle_overlap(self):
    """Custom suffixes are applied to the overlapping ``key1`` column.

    The fixtures are merged on ``key2`` with suffixes ``.foo`` / ``.bar``.
    """
    joined = merge(self.df, self.df2, on='key2',
                   suffixes=['.foo', '.bar'])

    # assertIn replaces the deprecated ``assert_`` alias and reports the
    # missing column name on failure.
    self.assertIn('key1.foo', joined)
    self.assertIn('key1.bar', joined)
def test_handle_overlap(self):
    """Custom suffixes are applied to the overlapping ``key1`` column."""
    suffixes = ['.foo', '.bar']
    joined = merge(self.df, self.df2, on='key2', suffixes=suffixes)

    # Both suffixed variants of ``key1`` must be present in the result.
    for column in ('key1.foo', 'key1.bar'):
        self.assertIn(column, joined)