Example #1
0
    def test_merge_index_singlekey_right_vs_left(self):
        left = DataFrame({
            'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
            'v1': np.random.randn(7)
        })
        right = DataFrame({'v2': np.random.randn(4)},
                          index=['d', 'b', 'c', 'a'])

        merged1 = merge(left,
                        right,
                        left_on='key',
                        right_index=True,
                        how='left',
                        sort=False)
        merged2 = merge(right,
                        left,
                        right_on='key',
                        left_index=True,
                        how='right',
                        sort=False)
        assert_frame_equal(merged1, merged2.ix[:, merged1.columns])

        merged1 = merge(left,
                        right,
                        left_on='key',
                        right_index=True,
                        how='left',
                        sort=True)
        merged2 = merge(right,
                        left,
                        right_on='key',
                        left_index=True,
                        how='right',
                        sort=True)
        assert_frame_equal(merged1, merged2.ix[:, merged1.columns])
Example #2
0
    def test_handle_join_key_pass_array(self):
        left = DataFrame({"key": [1, 1, 2, 2, 3], "value": range(5)}, columns=["value", "key"])
        right = DataFrame({"rvalue": range(6)})
        key = np.array([1, 1, 2, 3, 4, 5])

        merged = merge(left, right, left_on="key", right_on=key, how="outer")
        merged2 = merge(right, left, left_on=key, right_on="key", how="outer")

        assert_series_equal(merged["key"], merged2["key"])
        self.assert_(merged["key"].notnull().all())
        self.assert_(merged2["key"].notnull().all())

        left = DataFrame({"value": range(5)}, columns=["value"])
        right = DataFrame({"rvalue": range(6)})
        lkey = np.array([1, 1, 2, 2, 3])
        rkey = np.array([1, 1, 2, 3, 4, 5])

        merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer")
        self.assert_(np.array_equal(merged["key_0"], np.array([1, 1, 1, 1, 2, 2, 3, 4, 5])))

        left = DataFrame({"value": range(3)})
        right = DataFrame({"rvalue": range(6)})

        key = np.array([0, 1, 1, 2, 2, 3])
        merged = merge(left, right, left_index=True, right_on=key, how="outer")
        self.assert_(np.array_equal(merged["key_0"], key))
def plotMain(logFile, outputLabels=None):
    """Plot training and test curves parsed from a training log.

    Args:
        logFile: Log location handed to ``logparser.parse_log``.
        outputLabels: Optional iterable of labels.  When ``None`` the log is
            parsed once.  Otherwise it is parsed once per label and the
            per-label ``TrainingLoss`` / ``TestLoss`` columns are summed
            across labels (rows are matched on ``NumIters``).

    With ``outputLabels=None`` the function additionally plots
    ``TestAccuracy`` and prints the row with the highest ``TestAccuracy``,
    first over all rows and then over rows whose ``NumIters`` is a multiple
    of 10000.
    """
    trainColumns = ['NumIters', 'LearningRate', 'TrainingLoss']
    testColumns = ['Seconds', 'NumIters', 'TestLoss', 'TestAccuracy']

    if outputLabels is None:
        train_dict_list, _, test_dict_list, _ = logparser.parse_log(logFile)
        dfTraining = pd.DataFrame(train_dict_list, columns=trainColumns)
        dfTest = pd.DataFrame(test_dict_list, columns=testColumns)
        df = merge(dfTraining, dfTest, how='inner', on='NumIters')
        df.plot(x='NumIters', y=['TrainingLoss', 'TestLoss'])
        plt.show()
        df.plot(x='NumIters', y=['TestAccuracy'])
        plt.show()
        # ``idxmax`` yields the index *label* of the maximum, which is what
        # the label-based ``.loc`` lookup below needs (``argmax`` is
        # positional in current pandas and breaks on the filtered frame).
        rowMax = df['TestAccuracy'].idxmax()
        # Single-argument ``print(...)`` behaves the same on Python 2 and 3.
        print(df.loc[[rowMax]])
        dfSub = df[df['NumIters'] % 10000 == 0]
        rowMax = dfSub['TestAccuracy'].idxmax()
        print(dfSub.loc[[rowMax]])
    else:
        df = pd.DataFrame()
        for lbl in outputLabels:
            train_dict_list, _, test_dict_list, _ = logparser.parse_log(logFile, lbl)
            dfTrainingCurrent = pd.DataFrame(train_dict_list, columns=trainColumns)
            dfTestCurrent = pd.DataFrame(test_dict_list, columns=testColumns)
            mergedCurrent = merge(dfTrainingCurrent, dfTestCurrent, how='inner', on='NumIters')
            if 'NumIters' in df:
                # Accumulate this label's losses onto the running totals.
                df = merge(df, mergedCurrent, how='inner', on='NumIters')
                df['TrainingLoss'] = df['TrainingLoss_x'] + df['TrainingLoss_y']
                df['TestLoss'] = df['TestLoss_x'] + df['TestLoss_y']
                df = df.drop(['TrainingLoss_x', 'TrainingLoss_y', 'TestLoss_x', 'TestLoss_y'], axis=1)
            else:
                # First label: nothing to accumulate onto yet.
                df = mergedCurrent

    # NOTE(review): this runs for both branches, so with outputLabels=None the
    # loss figure is shown a second time -- confirm that is intended.
    df.plot(x='NumIters', y=['TrainingLoss', 'TestLoss'])
    plt.show()
Example #4
0
    def test_inner_join(self):
        """Inner-merge the two fixture frames and validate both results.

        The first merge joins on ``key2`` only; the second passes no key and
        is validated against ``['key1', 'key2']``.
        """
        lhs, rhs = self.df, self.df2

        by_key2 = merge(lhs, rhs, on='key2', how='inner')
        _check_join(lhs, rhs, by_key2, ['key2'], how='inner')

        by_default = merge(lhs, rhs, how='inner')
        _check_join(lhs, rhs, by_default, ['key1', 'key2'], how='inner')
Example #5
0
    def test_handle_join_key_pass_array(self):
        left = DataFrame({'key' : [1, 1, 2, 2, 3],
                          'value' : range(5)}, columns=['value', 'key'])
        right = DataFrame({'rvalue' : range(6)})
        key = np.array([1, 1, 2, 3, 4, 5])

        merged = merge(left, right, left_on='key', right_on=key, how='outer')
        merged2 = merge(right, left, left_on=key, right_on='key', how='outer')

        assert_series_equal(merged['key'], merged2['key'])
        self.assert_(merged['key'].notnull().all())
        self.assert_(merged2['key'].notnull().all())

        left = DataFrame({'value' : range(5)}, columns=['value'])
        right = DataFrame({'rvalue' : range(6)})
        lkey = np.array([1, 1, 2, 2, 3])
        rkey = np.array([1, 1, 2, 3, 4, 5])

        merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer')
        self.assert_(np.array_equal(merged['key_0'],
                                    np.array([1, 1, 1, 1, 2, 2, 3, 4, 5])))

        left = DataFrame({'value': range(3)})
        right = DataFrame({'rvalue' : range(6)})

        key = np.array([0, 1, 1, 2, 2, 3])
        merged = merge(left, right, left_index=True, right_on=key, how='outer')
        self.assert_(np.array_equal(merged['key_0'], key))
Example #6
0
    def test_join_inner_multiindex(self):
        """Inner ``DataFrame.join`` on two key columns against a MultiIndex.

        The joined frame is compared with the equivalent ``merge`` calls
        (column-to-column after ``reset_index``, and index-to-column with the
        operands swapped) and its index must be monotonic.
        """
        key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
        key2 = ["two", "one", "three", "one", "two", "one", "two", "two", "three", "one"]

        data = np.random.randn(len(key1))
        data = DataFrame({"key1": key1, "key2": key2, "data": data})

        index = MultiIndex(
            levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
            labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
            names=["first", "second"],
        )
        to_join = DataFrame(np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"])

        joined = data.join(to_join, on=["key1", "key2"], how="inner")
        expected = merge(
            data, to_join.reset_index(), left_on=["key1", "key2"], right_on=["first", "second"], how="inner", sort=False
        )

        expected2 = merge(to_join, data, right_on=["key1", "key2"], left_index=True, how="inner", sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        # "first"/"second" only repeat the matched key1/key2 values, so drop
        # them and adopt the joined index before the final comparison.
        expected = expected.drop(["first", "second"], axis=1)
        expected.index = joined.index

        # ``assert_`` is a deprecated unittest alias; use ``assertTrue``.
        self.assertTrue(joined.index.is_monotonic)
        assert_frame_equal(joined, expected)
Example #7
0
    def test_inner_join(self):
        """Inner-merge the two fixture frames and validate both results.

        The first merge joins on ``key2`` only; the second passes no key and
        is validated against ``['key1', 'key2']``.
        """
        lhs, rhs = self.df, self.df2

        by_key2 = merge(lhs, rhs, on='key2', how='inner')
        _check_join(lhs, rhs, by_key2, ['key2'], how='inner')

        by_default = merge(lhs, rhs, how='inner')
        _check_join(lhs, rhs, by_default, ['key1', 'key2'], how='inner')
Example #8
0
    def test_right_outer_join(self):
        """Right-merge the two fixture frames and validate both results.

        The first merge joins on ``key2`` only; the second passes no key and
        is validated against ``['key1', 'key2']``.
        """
        lhs, rhs = self.df, self.df2

        by_key2 = merge(lhs, rhs, on='key2', how='right')
        _check_join(lhs, rhs, by_key2, ['key2'], how='right')

        by_default = merge(lhs, rhs, how='right')
        _check_join(lhs, rhs, by_default, ['key1', 'key2'], how='right')
Example #9
0
    def test_right_outer_join(self):
        """Right-merge the two fixture frames and validate both results.

        The first merge joins on ``key2`` only; the second passes no key and
        is validated against ``['key1', 'key2']``.
        """
        lhs, rhs = self.df, self.df2

        by_key2 = merge(lhs, rhs, on='key2', how='right')
        _check_join(lhs, rhs, by_key2, ['key2'], how='right')

        by_default = merge(lhs, rhs, how='right')
        _check_join(lhs, rhs, by_default, ['key1', 'key2'], how='right')
Example #10
0
    def test_handle_join_key_pass_array(self):
        left = DataFrame({
            'key': [1, 1, 2, 2, 3],
            'value': range(5)
        },
                         columns=['value', 'key'])
        right = DataFrame({'rvalue': range(6)})
        key = np.array([1, 1, 2, 3, 4, 5])

        merged = merge(left, right, left_on='key', right_on=key, how='outer')
        merged2 = merge(right, left, left_on=key, right_on='key', how='outer')

        assert_series_equal(merged['key'], merged2['key'])
        self.assert_(merged['key'].notnull().all())
        self.assert_(merged2['key'].notnull().all())

        left = DataFrame({'value': range(5)}, columns=['value'])
        right = DataFrame({'rvalue': range(6)})
        lkey = np.array([1, 1, 2, 2, 3])
        rkey = np.array([1, 1, 2, 3, 4, 5])

        merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer')
        self.assert_(
            np.array_equal(merged['key_0'],
                           np.array([1, 1, 1, 1, 2, 2, 3, 4, 5])))

        left = DataFrame({'value': range(3)})
        right = DataFrame({'rvalue': range(6)})

        key = np.array([0, 1, 1, 2, 2, 3])
        merged = merge(left, right, left_index=True, right_on=key, how='outer')
        self.assert_(np.array_equal(merged['key_0'], key))
Example #11
0
 def test_join_on_fails_with_different_column_counts(self):
     """``merge`` must raise ``ValueError`` when ``left_on`` names two keys but ``right_on`` only one."""
     df = DataFrame({'a': np.random.choice(['m', 'f'], size=3),
                     'b': np.random.randn(3)})
     df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10),
                      'b': np.random.randn(10)},
                     index=tm.makeCustomIndex(10, 2))
     # Only the merge call sits inside the context manager, so a ValueError
     # raised while building the fixtures cannot satisfy the assertion.
     with tm.assertRaises(ValueError):
         merge(df, df2, right_on='a', left_on=['a', 'b'])
Example #12
0
    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame({
            "a": [1, 1, 1],
            "b": [1, 1, 2],
            "e": [1000, 2000, 3000]
        })
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.ix[:, result.columns])

        # GH 11519
        df = DataFrame({
            'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
            'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
            'C':
            np.random.randn(8),
            'D':
            np.random.randn(8)
        })
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2),
                   name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)
Example #13
0
    def test_join_on_fails_with_wrong_object_type(self):
        """``merge`` must reject non-DataFrame operands on either side.

        For each offending object a ``ValueError`` whose message matches
        ``str(type(obj))`` is expected, whether the object is passed as the
        left or as the right operand.
        """
        # GH12081
        frame = DataFrame({'a': [1, 1]})
        not_frames = [Series([0, 1]), 2, 'str', None, np.array([0, 1])]

        for candidate in not_frames:
            type_text = str(type(candidate))
            for lhs, rhs in ((candidate, frame), (frame, candidate)):
                with tm.assertRaisesRegexp(ValueError, type_text):
                    merge(lhs, rhs, left_on='a', right_on='a')
Example #14
0
    def test_join_on_fails_with_wrong_object_type(self):
        """``merge`` must reject non-DataFrame operands on either side.

        For each offending object a ``ValueError`` whose message matches
        ``str(type(obj))`` is expected, whether the object is passed as the
        left or as the right operand.
        """
        # GH12081
        frame = DataFrame({'a': [1, 1]})
        not_frames = [Series([0, 1]), 2, 'str', None, np.array([0, 1])]

        for candidate in not_frames:
            type_text = str(type(candidate))
            for lhs, rhs in ((candidate, frame), (frame, candidate)):
                with tm.assertRaisesRegexp(ValueError, type_text):
                    merge(lhs, rhs, left_on='a', right_on='a')
Example #15
0
    def test_merge_index_singlekey_right_vs_left(self):
        left = DataFrame({"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)})
        right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"])

        merged1 = merge(left, right, left_on="key", right_index=True, how="left", sort=False)
        merged2 = merge(right, left, right_on="key", left_index=True, how="right", sort=False)
        assert_frame_equal(merged1, merged2.ix[:, merged1.columns])

        merged1 = merge(left, right, left_on="key", right_index=True, how="left", sort=True)
        merged2 = merge(right, left, right_on="key", left_index=True, how="right", sort=True)
        assert_frame_equal(merged1, merged2.ix[:, merged1.columns])
Example #16
0
    def test_merge_index_singlekey_right_vs_left(self):
        left = DataFrame({'key' : ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                          'v1' : np.random.randn(7)})
        right = DataFrame({'v2' : np.random.randn(4)},
                           index=['d', 'b', 'c', 'a'])

        merged1 = merge(left, right, left_on='key',
                        right_index=True, how='left')
        merged2 = merge(right, left, right_on='key',
                        left_index=True, how='right')
        assert_frame_equal(merged1, merged2.ix[:, merged1.columns])
Example #17
0
    def test_merge_index_singlekey_inner(self):
        left = DataFrame({"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)})
        right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"])

        # inner join
        result = merge(left, right, left_on="key", right_index=True, how="inner")
        expected = left.join(right, on="key").ix[result.index]
        assert_frame_equal(result, expected)

        result = merge(right, left, right_on="key", left_index=True, how="inner")
        expected = left.join(right, on="key").ix[result.index]
        assert_frame_equal(result, expected.ix[:, result.columns])
Example #18
0
 def test_join_on_fails_with_different_column_counts(self):
     """``merge`` must raise ``ValueError`` when ``left_on`` names two keys but ``right_on`` only one."""
     df = DataFrame({
         'a': np.random.choice(['m', 'f'], size=3),
         'b': np.random.randn(3)
     })
     df2 = DataFrame(
         {
             'a': np.random.choice(['m', 'f'], size=10),
             'b': np.random.randn(10)
         },
         index=tm.makeCustomIndex(10, 2))
     # Only the merge call sits inside the context manager, so a ValueError
     # raised while building the fixtures cannot satisfy the assertion.
     with tm.assertRaises(ValueError):
         merge(df, df2, right_on='a', left_on=['a', 'b'])
Example #19
0
    def test_join_many_non_unique_index(self):
        df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
        df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
        df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])

        result = idf1.join([idf2, idf3], how='outer')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer')

        result = result.reset_index()
        expected = expected[result.columns]
        expected['a'] = expected.a.astype('int64')
        expected['b'] = expected.b.astype('int64')
        assert_frame_equal(result, expected)

        df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
        df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
        df3 = DataFrame(
            {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
        idf1 = df1.set_index(["a", "b"])
        idf2 = df2.set_index(["a", "b"])
        idf3 = df3.set_index(["a", "b"])
        result = idf1.join([idf2, idf3], how='inner')

        df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner')
        expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner')

        result = result.reset_index()

        assert_frame_equal(result, expected.ix[:, result.columns])

        # GH 11519
        df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                              'foo', 'bar', 'foo', 'foo'],
                        'B': ['one', 'one', 'two', 'three',
                              'two', 'two', 'one', 'three'],
                        'C': np.random.randn(8),
                        'D': np.random.randn(8)})
        s = Series(np.repeat(np.arange(8), 2),
                   index=np.repeat(np.arange(8), 2), name='TEST')
        inner = df.join(s, how='inner')
        outer = df.join(s, how='outer')
        left = df.join(s, how='left')
        right = df.join(s, how='right')
        assert_frame_equal(inner, outer)
        assert_frame_equal(inner, left)
        assert_frame_equal(inner, right)
Example #20
0
    def test_join_inner_multiindex(self):
        """Inner ``DataFrame.join`` on two key columns against a MultiIndex.

        The joined frame is compared with the equivalent ``merge`` calls
        (column-to-column after ``reset_index``, and index-to-column with the
        operands swapped) and its index must be monotonic.
        """
        key1 = [
            'bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', 'qux',
            'snap'
        ]
        key2 = [
            'two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', 'three',
            'one'
        ]

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2, 'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3),
                            index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data,
                         to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'],
                         how='inner',
                         sort=False)

        # Computed once: the original repeated this identical merge a second
        # time without ever using the result.
        expected2 = merge(to_join,
                          data,
                          right_on=['key1', 'key2'],
                          left_index=True,
                          how='inner',
                          sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        # 'first'/'second' only repeat the matched key1/key2 values, so drop
        # them and adopt the joined index before the final comparison.
        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        self.assertTrue(joined.index.is_monotonic)
        assert_frame_equal(joined, expected)
Example #21
0
def plotMain(logFile, outputLabels=None):
    """Plot training and test curves parsed from a training log.

    Args:
        logFile: Log location handed to ``logparser.parse_log``.
        outputLabels: Optional iterable of labels.  When ``None`` the log is
            parsed once.  Otherwise it is parsed once per label and the
            per-label ``TrainingLoss`` / ``TestLoss`` columns are summed
            across labels (rows are matched on ``NumIters``).

    With ``outputLabels=None`` the function additionally plots
    ``TestAccuracy`` and prints the row with the highest ``TestAccuracy``,
    first over all rows and then over rows whose ``NumIters`` is a multiple
    of 10000.
    """
    trainColumns = ['NumIters', 'LearningRate', 'TrainingLoss']
    testColumns = ['Seconds', 'NumIters', 'TestLoss', 'TestAccuracy']

    if outputLabels is None:
        train_dict_list, _, test_dict_list, _ = logparser.parse_log(logFile)
        dfTraining = pd.DataFrame(train_dict_list, columns=trainColumns)
        dfTest = pd.DataFrame(test_dict_list, columns=testColumns)
        df = merge(dfTraining, dfTest, how='inner', on='NumIters')
        df.plot(x='NumIters', y=['TrainingLoss', 'TestLoss'])
        plt.show()
        df.plot(x='NumIters', y=['TestAccuracy'])
        plt.show()
        # ``idxmax`` yields the index *label* of the maximum, which is what
        # the label-based ``.loc`` lookup below needs (``argmax`` is
        # positional in current pandas and breaks on the filtered frame).
        rowMax = df['TestAccuracy'].idxmax()
        # Single-argument ``print(...)`` behaves the same on Python 2 and 3.
        print(df.loc[[rowMax]])
        dfSub = df[df['NumIters'] % 10000 == 0]
        rowMax = dfSub['TestAccuracy'].idxmax()
        print(dfSub.loc[[rowMax]])
    else:
        df = pd.DataFrame()
        for lbl in outputLabels:
            train_dict_list, _, test_dict_list, _ = logparser.parse_log(
                logFile, lbl)
            dfTrainingCurrent = pd.DataFrame(train_dict_list,
                                             columns=trainColumns)
            dfTestCurrent = pd.DataFrame(test_dict_list, columns=testColumns)
            mergedCurrent = merge(dfTrainingCurrent,
                                  dfTestCurrent,
                                  how='inner',
                                  on='NumIters')
            if 'NumIters' in df:
                # Accumulate this label's losses onto the running totals.
                df = merge(df, mergedCurrent, how='inner', on='NumIters')
                df['TrainingLoss'] = df['TrainingLoss_x'] + df['TrainingLoss_y']
                df['TestLoss'] = df['TestLoss_x'] + df['TestLoss_y']
                df = df.drop([
                    'TrainingLoss_x', 'TrainingLoss_y', 'TestLoss_x',
                    'TestLoss_y'
                ], axis=1)
            else:
                # First label: nothing to accumulate onto yet.
                df = mergedCurrent

    # NOTE(review): this runs for both branches, so with outputLabels=None the
    # loss figure is shown a second time -- confirm that is intended.
    df.plot(x='NumIters', y=['TrainingLoss', 'TestLoss'])
    plt.show()
Example #22
0
    def test_merge_index_singlekey_inner(self):
        left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                          'v1': np.random.randn(7)})
        right = DataFrame({'v2': np.random.randn(4)},
                           index=['d', 'b', 'c', 'a'])

        # inner join
        result = merge(left, right, left_on='key', right_index=True,
                       how='inner')
        expected = left.join(right, on='key').ix[result.index]
        assert_frame_equal(result, expected)

        result = merge(right, left, right_on='key', left_index=True,
                       how='inner')
        expected = left.join(right, on='key').ix[result.index]
        assert_frame_equal(result, expected.ix[:, result.columns])
Example #23
0
    def test_handle_overlap_arbitrary_key(self):
        joined = merge(self.df, self.df2,
                       left_on='key2', right_on='key1',
                       suffixes=['.foo', '.bar'])

        self.assert_('key1.foo' in joined)
        self.assert_('key2.bar' in joined)
Example #24
0
    def test_merge_index_singlekey_inner(self):
        left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'],
                          'v1': np.random.randn(7)})
        right = DataFrame({'v2': np.random.randn(4)},
                           index=['d', 'b', 'c', 'a'])

        # inner join
        result = merge(left, right, left_on='key', right_index=True,
                       how='inner')
        expected = left.join(right, on='key').ix[result.index]
        assert_frame_equal(result, expected)

        result = merge(right, left, right_on='key', left_index=True,
                       how='inner')
        expected = left.join(right, on='key').ix[result.index]
        assert_frame_equal(result, expected.ix[:, result.columns])
Example #25
0
    def test_left_join_index_preserve_order(self):

        left = DataFrame({
            'k1': [0, 1, 2] * 8,
            'k2': ['foo', 'bar'] * 12,
            'v': np.arange(24)
        })

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)

        result = left.join(right, on=['k1', 'k2'])

        expected = left.copy()
        expected['v2'] = np.nan
        expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5
        expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7

        tm.assert_frame_equal(result, expected)

        # do a right join for an extra test
        joined = merge(right,
                       left,
                       left_index=True,
                       right_on=['k1', 'k2'],
                       how='right')
        tm.assert_frame_equal(joined.ix[:, expected.columns], expected)
Example #26
0
 def load_indices(self, tickers, startdate, lags):
     """Load, or build and cache, lagged daily percentage changes per ticker.

     If ``DATA.csv`` exists it is read back and stored on ``self.dataframe``
     unchanged (the arguments are then only recorded, not used to rebuild).
     Otherwise each ticker is downloaded via ``web.get_data_yahoo``, reduced
     to the one-day percentage change of ``'Adj Close'``, passed through
     ``preprocess``, extended with ``lags`` shifted copies, merged on the
     index into ``self.sp`` and finally written to ``DATA.csv``.

     Args:
         tickers: Iterable of ticker symbols.
         startdate: Start date forwarded to ``web.get_data_yahoo``.
         lags: Number of lagged columns to add per ticker.
     """
     self.tickers = tickers
     self.filename = "DATA.csv"
     self.startdate = startdate
     self.enddate = datetime.date.today().strftime("%Y%m%d")
     if os.path.isfile(self.filename):
         # ``DataFrame.from_csv`` was deprecated and removed from pandas;
         # this ``read_csv`` call is its documented equivalent.
         self.dataframe = pan.read_csv(self.filename, index_col=0,
                                       parse_dates=True)
         return

     for ticker in tickers:
         data = web.get_data_yahoo(ticker, self.startdate, self.enddate)
         index = ticker + '1change'
         data[index] = data['Adj Close'].pct_change(1)
         # keep only the change column and drop the first row
         data = data[[index]]
         data = data[1:]
         #filter out middle threshold noise
         #data = data[np.logical_or(data[index] >= threshold, data[index] <= -threshold)]
         #preprocess data
         data = data.apply(preprocess)
         # add one shifted copy of the change column per lag
         for i in range(1, lags + 1):
             label = ticker + "%dlag" % i
             data[label] = data[index].shift(i)
         #remove rows used for change calculation
         data = data[lags + 1:]
         # Single-argument ``print(...)`` behaves the same on Python 2 and 3.
         print(data.head(10))
         if ticker == "%5EGSPC":
             self.sp = data
         else:
             # NOTE(review): relies on ``self.sp`` already existing, i.e. on
             # "%5EGSPC" being processed first or ``self.sp`` being set
             # elsewhere -- confirm against callers.
             self.sp = merge(self.sp, data, left_index=True, right_index=True)
     self.dataframe = self.sp
     self.dataframe.to_csv(self.filename)
Example #27
0
 def test_handle_overlap_arbitrary_key(self):
     joined = merge(self.df,
                    self.df2,
                    left_on='key2',
                    right_on='key1',
                    suffixes=['.foo', '.bar'])
     self.assertIn('key1.foo', joined)
     self.assertIn('key2.bar', joined)
Example #28
0
 def test_join_hierarchical_mixed(self):
     """Index-merge a frame with two-level columns and one with flat columns.

     The merge is expected to emit a ``UserWarning``; the result must contain
     both the tuple column ``('b', 'mean')`` and the plain column ``'b'``.
     """
     # GH 2024
     base = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
     aggregated = base.groupby(['a']).agg({'b': [np.mean, np.sum]})
     flat = DataFrame([(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
     flat.set_index('a', inplace=True)
     # GH 9455, 12219
     with tm.assert_produces_warning(UserWarning):
         result = merge(aggregated, flat, left_index=True, right_index=True)
     for column in (('b', 'mean'), 'b'):
         self.assertTrue(column in result)
Example #29
0
    def test_merge_nocopy(self):
        """With ``copy=False`` a write to the merged frame must be visible in the inputs.

        Column ``a`` comes from ``left`` and column ``d`` from ``right``;
        after overwriting each in ``merged`` the source frame is checked for
        the new value.
        """
        left = DataFrame({"a": 0, "b": 1}, index=range(10))
        right = DataFrame({"c": "foo", "d": "bar"}, index=range(10))

        merged = merge(left, right, left_index=True, right_index=True, copy=False)

        # ``assert_`` is a deprecated unittest alias; ``assertTrue`` is the
        # supported spelling.
        merged["a"] = 6
        self.assertTrue((left["a"] == 6).all())

        merged["d"] = "peekaboo"
        self.assertTrue((right["d"] == "peekaboo").all())
Example #30
0
    def test_merge_nocopy(self):
        """With ``copy=False`` a write to the merged frame must be visible in the inputs.

        Column ``a`` comes from ``left`` and column ``d`` from ``right``;
        after overwriting each in ``merged`` the source frame is checked for
        the new value.
        """
        left = DataFrame({'a' : 0, 'b' : 1}, index=range(10))
        right = DataFrame({'c' : 'foo', 'd' : 'bar'}, index=range(10))

        merged = merge(left, right, left_index=True,
                       right_index=True, copy=False)

        # ``assert_`` is a deprecated unittest alias; ``assertTrue`` is the
        # supported spelling.
        merged['a'] = 6
        self.assertTrue((left['a'] == 6).all())

        merged['d'] = 'peekaboo'
        self.assertTrue((right['d'] == 'peekaboo').all())
Example #31
0
    def test_left_merge_na_buglet(self):
        left = DataFrame(
            {"id": list("abcde"), "v1": randn(5), "v2": randn(5), "dummy": list("abcde"), "v3": randn(5)},
            columns=["id", "v1", "v2", "dummy", "v3"],
        )
        right = DataFrame({"id": ["a", "b", np.nan, np.nan, np.nan], "sv3": [1.234, 5.678, np.nan, np.nan, np.nan]})

        merged = merge(left, right, on="id", how="left")

        rdf = right.drop(["id"], axis=1)
        expected = left.join(rdf)
        tm.assert_frame_equal(merged, expected)
Example #32
0
 def test_join_hierarchical_mixed(self):
     """Index-merge a frame with two-level columns and one with flat columns.

     The merge is expected to emit a ``UserWarning``; the result must contain
     both the tuple column ``('b', 'mean')`` and the plain column ``'b'``.
     """
     # GH 2024
     base = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c'])
     aggregated = base.groupby(['a']).agg({'b': [np.mean, np.sum]})
     flat = DataFrame([(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd'])
     flat.set_index('a', inplace=True)
     # GH 9455, 12219
     with tm.assert_produces_warning(UserWarning):
         result = merge(aggregated, flat, left_index=True, right_index=True)
     for column in (('b', 'mean'), 'b'):
         self.assertTrue(column in result)
Example #33
0
    def test_merge_nocopy(self):
        """With ``copy=False`` a write to the merged frame must be visible in the inputs.

        Column ``a`` comes from ``left`` and column ``d`` from ``right``;
        after overwriting each in ``merged`` the source frame is checked for
        the new value.
        """
        left = DataFrame({'a' : 0, 'b' : 1}, index=range(10))
        right = DataFrame({'c' : 'foo', 'd' : 'bar'}, index=range(10))

        merged = merge(left, right, left_index=True,
                       right_index=True, copy=False)

        # ``assert_`` is a deprecated unittest alias; ``assertTrue`` is the
        # supported spelling.
        merged['a'] = 6
        self.assertTrue((left['a'] == 6).all())

        merged['d'] = 'peekaboo'
        self.assertTrue((right['d'] == 'peekaboo').all())
Example #34
0
    def test_compress_group_combinations(self):
        """Smoke test: outer merge on two high-cardinality string key columns.

        There are no assertions; the merge only has to complete without
        raising.
        """

        # ~ 40000000 possible unique groups
        # ``range`` works on both Python 2 and 3 (``xrange`` is Python 2 only).
        key1 = np.array([rands(10) for _ in range(10000)], dtype="O")
        key1 = np.tile(key1, 2)
        key2 = key1[::-1]

        df = DataFrame({"key1": key1, "key2": key2, "value1": np.random.randn(20000)})

        df2 = DataFrame({"key1": key1[::2], "key2": key2[::2], "value2": np.random.randn(10000)})

        # just to hit the label compression code path
        merge(df, df2, how="outer")
Example #35
0
    def test_left_merge_na_buglet(self):
        left = DataFrame({'id': list('abcde'), 'v1': randn(5),
                          'v2': randn(5), 'dummy' : list('abcde'),
                          'v3' : randn(5)},
                         columns=['id', 'v1', 'v2', 'dummy', 'v3'])
        right = DataFrame({'id' : ['a', 'b', np.nan, np.nan, np.nan],
                           'sv3' : [1.234, 5.678, np.nan, np.nan, np.nan]})

        merged = merge(left, right, on='id', how='left')

        rdf = right.drop(['id'], axis=1)
        expected = left.join(rdf)
        tm.assert_frame_equal(merged, expected)
Example #36
0
    def test_left_merge_na_buglet(self):
        left = DataFrame({'id': list('abcde'), 'v1': randn(5),
                          'v2': randn(5), 'dummy' : list('abcde'),
                          'v3' : randn(5)},
                         columns=['id', 'v1', 'v2', 'dummy', 'v3'])
        right = DataFrame({'id' : ['a', 'b', np.nan, np.nan, np.nan],
                           'sv3' : [1.234, 5.678, np.nan, np.nan, np.nan]})

        merged = merge(left, right, on='id', how='left')

        rdf = right.drop(['id'], axis=1)
        expected = left.join(rdf)
        tm.assert_frame_equal(merged, expected)
Example #37
0
    def test_join_inner_multiindex(self):
        """Inner ``DataFrame.join`` on two key columns against a MultiIndex.

        The joined frame is compared with the equivalent ``merge`` calls
        (column-to-column after ``reset_index``, and index-to-column with the
        operands swapped) and its index must be monotonic.
        """
        key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux',
               'qux', 'snap']
        key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two',
               'three', 'one']

        data = np.random.randn(len(key1))
        data = DataFrame({'key1': key1, 'key2': key2,
                         'data': data})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        to_join = DataFrame(np.random.randn(10, 3), index=index,
                            columns=['j_one', 'j_two', 'j_three'])

        joined = data.join(to_join, on=['key1', 'key2'], how='inner')
        expected = merge(data, to_join.reset_index(),
                         left_on=['key1', 'key2'],
                         right_on=['first', 'second'], how='inner',
                         sort=False)

        # Computed once: the original repeated this identical merge a second
        # time without ever using the result.
        expected2 = merge(to_join, data,
                          right_on=['key1', 'key2'], left_index=True,
                          how='inner', sort=False)
        assert_frame_equal(joined, expected2.reindex_like(joined))

        # 'first'/'second' only repeat the matched key1/key2 values, so drop
        # them and adopt the joined index before the final comparison.
        expected = expected.drop(['first', 'second'], axis=1)
        expected.index = joined.index

        # ``assert_`` is a deprecated unittest alias; use ``assertTrue``.
        self.assertTrue(joined.index.is_monotonic)
        assert_frame_equal(joined, expected)
Example #38
0
    def test_compress_group_combinations(self):
        # No assertions here: the test passes as long as the merge completes.

        # ~ 40000000 possible unique groups
        seed_keys = [rands(10) for _ in xrange(10000)]
        key1 = np.tile(np.array(seed_keys, dtype='O'), 2)
        key2 = key1[::-1]  # same values as key1, in reverse order

        full_data = {
            'key1': key1,
            'key2': key2,
            'value1': np.random.randn(20000),
        }
        df = DataFrame(full_data)

        # second frame keeps every other row of the key columns
        half_data = {
            'key1': key1[::2],
            'key2': key2[::2],
            'value2': np.random.randn(10000),
        }
        df2 = DataFrame(half_data)

        # just to hit the label compression code path
        merged = merge(df, df2, how='outer')
Example #39
0
    def test_intelligently_handle_join_key(self):
        # #733: an outer merge on a shared key column should hand back a
        # consolidated DataFrame

        left = DataFrame({'key': [1, 1, 2, 2, 3],
                          'value': range(5)}, columns=['value', 'key'])
        right = DataFrame({'key': [1, 1, 2, 3, 4, 5],
                           'rvalue': range(6)})

        joined = merge(left, right, on='key', how='outer')
        # keys 4 and 5 occur only in `right`, so their 'value' entries are NaN
        expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5.],
                              'value': np.array([0, 0, 1, 1, 2, 3, 4,
                                                 np.nan, np.nan]),
                              'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])},
                             columns=['value', 'key', 'rvalue'])
        assert_frame_equal(joined, expected)

        # assertTrue replaces the deprecated assert_ alias
        self.assertTrue(joined._data.is_consolidated())
Example #40
0
    def test_intelligently_handle_join_key(self):
        # #733: the outer merge result is checked both for its contents and
        # for being consolidated

        left = DataFrame({'key': [1, 1, 2, 2, 3],
                          'value': range(5)}, columns=['value', 'key'])
        right = DataFrame({'key': [1, 1, 2, 3, 4, 5],
                           'rvalue': range(6)})

        joined = merge(left, right, on='key', how='outer')

        # the last two rows come from right-only keys (4 and 5), hence the
        # NaN entries in 'value'
        expected = DataFrame({'key': [1, 1, 1, 1, 2, 2, 3, 4, 5.],
                              'value': np.array([0, 0, 1, 1, 2, 3, 4,
                                                 np.nan, np.nan]),
                              'rvalue': np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])},
                             columns=['value', 'key', 'rvalue'])
        assert_frame_equal(joined, expected)

        # use assertTrue: assert_ is a deprecated unittest alias
        self.assertTrue(joined._data.is_consolidated())
Example #41
0
    def test_intelligently_handle_join_key(self):
        # #733: merging on a shared key column should not return an
        # unconsolidated DataFrame

        left = DataFrame({"key": [1, 1, 2, 2, 3], "value": range(5)}, columns=["value", "key"])
        right = DataFrame({"key": [1, 1, 2, 3, 4, 5], "rvalue": range(6)})

        joined = merge(left, right, on="key", how="outer")
        # keys 4 and 5 are present only in `right`, so "value" is NaN for them
        expected = DataFrame(
            {
                "key": [1, 1, 1, 1, 2, 2, 3, 4, 5.0],
                "value": np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]),
                "rvalue": np.array([0, 1, 0, 1, 2, 2, 3, 4, 5]),
            },
            columns=["value", "key", "rvalue"],
        )
        assert_frame_equal(joined, expected)

        # assertTrue replaces the deprecated assert_ alias
        self.assertTrue(joined._data.is_consolidated())
Example #42
0
    def test_left_join_index_preserve_order(self):
        # 24-row left frame whose two key columns cycle with different periods
        left = DataFrame(
            {
                "k1": [0, 1, 2] * 8,
                "k2": ["foo", "bar"] * 12,
                "v": np.arange(24),
            }
        )

        # right frame is indexed by two (k1, k2) pairs
        right_index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
        right = DataFrame({"v2": [5, 7]}, index=right_index)

        result = left.join(right, on=["k1", "k2"])

        # build the expectation by hand: NaN everywhere except matching rows
        expected = left.copy()
        expected["v2"] = np.nan
        for (k1_val, k2_val), v2_val in [((2, "bar"), 5), ((1, "foo"), 7)]:
            hit = (expected.k1 == k1_val) & (expected.k2 == k2_val)
            expected["v2"][hit] = v2_val

        tm.assert_frame_equal(result, expected)

        # do a right join for an extra test
        joined = merge(
            right,
            left,
            left_index=True,
            right_on=["k1", "k2"],
            how="right",
        )
        reordered = joined.ix[:, expected.columns]
        tm.assert_frame_equal(reordered, expected)
Example #43
0
    def test_left_join_index_preserve_order(self):
        # Left-join on two key columns against a MultiIndex-ed frame and
        # compare with an expectation assembled in the left frame's row order.
        join_keys = ['k1', 'k2']

        left_data = {
            'k1': [0, 1, 2] * 8,
            'k2': ['foo', 'bar'] * 12,
            'v': np.arange(24),
        }
        left = DataFrame(left_data)

        index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')])
        right = DataFrame({'v2': [5, 7]}, index=index)

        result = left.join(right, on=join_keys)

        # start from all-NaN and fill in the two key pairs present in `right`
        expected = left.copy()
        expected['v2'] = np.nan
        is_2_bar = (expected.k1 == 2) & (expected.k2 == 'bar')
        expected['v2'][is_2_bar] = 5
        is_1_foo = (expected.k1 == 1) & (expected.k2 == 'foo')
        expected['v2'][is_1_foo] = 7

        tm.assert_frame_equal(result, expected)

        # do a right join for an extra test
        joined = merge(right, left, left_index=True, right_on=join_keys,
                       how='right')
        # put the columns in the expected order before comparing
        tm.assert_frame_equal(joined.ix[:, expected.columns], expected)
Example #44
0
 def load_indices(self, tickers, startdate, lags):
     """Populate ``self.dataframe`` with lagged change columns per ticker.

     If the file "DATA.csv" exists it is read back and used as-is.
     Otherwise every ticker is fetched with ``web.get_data_yahoo``, reduced
     to the 1-period percentage change of its 'Adj Close' column, passed
     through ``preprocess``, extended with ``lags`` shifted copies of the
     change column, and combined into ``self.sp`` by an index-on-index
     merge. The combined frame is then written to "DATA.csv".

     Parameters
     ----------
     tickers : iterable of str
         Ticker symbols; each is used both as the download key and as the
         prefix of the generated column names.
     startdate
         Start of the download range, forwarded to ``get_data_yahoo``.
     lags : int
         Number of lagged columns to create per ticker.

     Side effects: sets ``self.tickers``, ``self.filename``,
     ``self.startdate``, ``self.enddate`` and ``self.dataframe`` (plus
     ``self.sp`` on the download path), prints the head of each ticker's
     frame, and may write "DATA.csv".
     """
     self.tickers = tickers
     self.filename = "DATA.csv"
     self.startdate = startdate
     self.enddate = datetime.date.today().strftime("%Y%m%d")
     if os.path.isfile(self.filename):
         # NOTE(review): DataFrame.from_csv no longer exists in pandas >= 1.0;
         # read_csv(path, index_col=0, parse_dates=True) is the documented
         # replacement - confirm the targeted pandas version before switching.
         data = pan.DataFrame.from_csv(self.filename)
         self.dataframe = data
     else:
         for ticker in tickers:
             data = web.get_data_yahoo(ticker, self.startdate, self.enddate)
             index = ticker + '1change'
             data[index] = data['Adj Close'].pct_change(1)
             #remove unused columns and nan row
             data = data[[index]]
             data = data[1:]
             #filter out middle threshold noise
             #data = data[np.logical_or(data[index] >= threshold, data[index] <= -threshold)]
             #preprocess data
             data = data.apply(preprocess)
             #lag data
             for i in range(1, lags + 1):
                 label = ticker + "%dlag" % i
                 data[label] = data[index].shift(i)
             #remove rows used for change calculation
             data = data[lags + 1:]
             # Parenthesised so the line is valid on Python 2 and Python 3
             print(data.head(10))
             # NOTE(review): self.sp is only (re)initialised for "%5EGSPC";
             # any other ticker processed before it relies on self.sp already
             # existing - confirm callers always list "%5EGSPC" first.
             if ticker == "%5EGSPC":
                 self.sp = data
             else:
                 self.sp = merge(self.sp,
                                 data,
                                 left_index=True,
                                 right_index=True)
         self.dataframe = self.sp
         self.dataframe.to_csv(self.filename)
Example #45
0
 def test_merge_overlap(self):
     # Self-merge on 'key': a key occurring n times yields n * n rows, and
     # the overlapping v1 column is expected under both suffixed names.
     merged = merge(self.left, self.left, on='key')
     exp_len = (self.left['key'].value_counts() ** 2).sum()
     self.assertEqual(len(merged), exp_len)
     # assertIn replaces the deprecated assert_ alias and names the missing
     # member when the check fails
     self.assertIn('v1_x', merged)
     self.assertIn('v1_y', merged)
Example #46
0
 def test_merge_common(self):
     # A merge with no explicit keys must equal a merge on key1 and key2.
     explicit_keys = ['key1', 'key2']
     implicit = merge(self.df, self.df2)
     explicit = merge(self.df, self.df2, on=explicit_keys)
     tm.assert_frame_equal(implicit, explicit)
Example #47
0
 def test_merge_common(self):
     # Leaving out `on` is expected to give the same frame as naming
     # key1 and key2 explicitly.
     left, right = self.df, self.df2
     without_on = merge(left, right)
     with_on = merge(left, right, on=['key1', 'key2'])
     tm.assert_frame_equal(without_on, with_on)
Example #48
0
 def test_merge_overlap(self):
     # Merging the frame with itself on 'key' gives count ** 2 rows per key
     # and exposes the clashing v1 column under two suffixed names.
     merged = merge(self.left, self.left, on='key')
     exp_len = (self.left['key'].value_counts()**2).sum()
     self.assertEqual(len(merged), exp_len)
     # assertIn replaces the deprecated assert_ alias
     self.assertIn('v1.x', merged)
     self.assertIn('v1.y', merged)
Example #49
0
 def test_merge_overlap(self):
     # Self-merge on "key": the row count is the sum of squared key counts,
     # and both suffixed copies of v1 must be present in the result.
     merged = merge(self.left, self.left, on="key")
     exp_len = (self.left["key"].value_counts() ** 2).sum()
     self.assertEqual(len(merged), exp_len)
     # assertIn replaces the deprecated assert_ alias
     self.assertIn("v1_x", merged)
     self.assertIn("v1_y", merged)
Example #50
0
    def test_handle_overlap(self):
        # Merge on key2 only with custom suffixes; the test expects key1 to
        # appear once per side, carrying those suffixes.
        joined = merge(self.df, self.df2, on='key2',
                       suffixes=['.foo', '.bar'])

        # assertIn replaces the deprecated assert_ alias
        self.assertIn('key1.foo', joined)
        self.assertIn('key1.bar', joined)
Example #51
0
    def test_handle_overlap(self):
        # Custom suffixes passed to merge() must show up on the key1 column
        # of the result when merging on key2 alone.
        custom_suffixes = ['.foo', '.bar']
        joined = merge(self.df, self.df2, on='key2', suffixes=custom_suffixes)

        for suffixed_name in ('key1.foo', 'key1.bar'):
            self.assertIn(suffixed_name, joined)