Example #1
0
 def load_from_csv(self, tickers, index, fields=Fields.QUOTES, **kwargs):
     ''' Return a quote panel built from the 'Yahoo' csv data access.

     Parameters
     ----------
     tickers : iterable
         Tickers to load; each one is mapped to a symbol through the database.
     index : date index
         Its first and last entries bound the requested period and its
         ``freq.rollforward`` is used to group the raw quotes.
     fields : iterable
         Quote fields to fetch, in the order returned by the data access.
     reverse : bool, keyword only, default False
         Build the panel with ``orient='minor'``.
     verbose : bool, keyword only, default False
         Forwarded to the data access.

     Returns
     -------
     Panel, or None when no database connection is available.
     '''
     #TODO Replace adj_close with actual_close
     #TODO Add reindex methods, and start, end, delta parameters
     reverse = kwargs.get('reverse', False)
     verbose = kwargs.get('verbose', False)
     if not self.connected['database']:
         # The ticker -> symbol mapping only comes from the database; the
         # previous code tested an unassigned `symbols` here and crashed
         # with UnboundLocalError instead of reporting the problem.
         self._logger.error('** No database neither informations provided')
         return None
     symbols, _ = self.db.getTickersCodes(tickers)
     timestamps = du.getNYSEdays(index[0], index[-1], dt.timedelta(hours=16))
     csv = da.DataAccess('Yahoo')
     df = csv.get_data(timestamps, symbols.values(), fields, verbose=verbose)
     quotes_dict = dict()
     for ticker in tickers:
         quotes_dict[ticker] = dict()
         # `df` holds one frame per requested field, in `fields` order.
         for j, field in enumerate(fields):
             serie = df[j][symbols[ticker]].groupby(index.freq.rollforward).aggregate(np.mean)
             #TODO add a function parameter to decide what to do about it
             clean_serie = serie.fillna(method='pad')
             quotes_dict[ticker][field] = clean_serie
     if reverse:
         return Panel.from_dict(quotes_dict, intersect=True, orient='minor')
     return Panel.from_dict(quotes_dict, intersect=True)
Example #2
0
    def test_swaplevel_panel(self):
        """Panel.swaplevel on the major axis equals swapping levels on a copy."""
        frames = {"ItemA": self.frame, "ItemB": self.frame * 2}
        panel = Panel(frames)

        swapped = panel.swaplevel(0, 1, axis="major")

        reference = panel.copy()
        reference.major_axis = reference.major_axis.swaplevel(0, 1)
        tm.assert_panel_equal(swapped, reference)
Example #3
0
    def test_swaplevel_panel(self):
        """Panel.swaplevel on the major axis equals swapping levels on a copy."""
        panel = Panel({'ItemA' : self.frame,
                       'ItemB' : self.frame * 2})

        result = panel.swaplevel(0, 1, axis='major')
        expected = panel.copy()
        expected.major_axis = expected.major_axis.swaplevel(0, 1)
        # Without this comparison the test could never fail.
        tm.assert_panel_equal(result, expected)
 def execute(self, strategy):
     """Run every forecaster on ``strategy`` and combine them into one Signal.

     The per-forecaster data is stacked into a Panel keyed by forecaster
     name; the signal value is the normalised mean across items.
     """
     collected = {}
     for fc in self.forecasters:
         output = fc(strategy).data
         collected[fc.name] = output
     stacked = Panel(collected)
     averaged = stacked.mean(axis = 'items')
     mean_fcst = self.normalise(averaged)
     return Signal(mean_fcst, [-20, 20], stacked)
Example #5
0
    def test_panel_join_many(self):
        """Joining a list of panels works for inner/outer and rejects bad options."""
        tm.K = 10
        panel = tm.makePanel()
        tm.K = 4

        # Disjoint item slices: joining them back must rebuild the original.
        pieces = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]]
        head, rest = pieces[0], pieces[1:]
        tm.assert_panel_equal(head.join(rest), panel)

        # Pieces that also differ along the major axis.
        pieces = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]]
        head, rest = pieces[0], pieces[1:]

        data_dict = {}
        for piece in pieces:
            data_dict.update(piece.iteritems())

        for how, intersect in (('inner', True), ('outer', False)):
            joined = head.join(rest, how=how)
            expected = Panel.from_dict(data_dict, intersect=intersect)
            tm.assert_panel_equal(joined, expected)

        # edge cases
        self.assertRaises(ValueError, head.join, rest,
                          how='outer', lsuffix='foo', rsuffix='bar')
        self.assertRaises(ValueError, head.join, rest,
                          how='right')
def create_data():
    """Create the pickle/msgpack data.

    Returns
    -------
    dict
        Keys are ``series``, ``frame``, ``panel``, ``index``, ``mi``,
        ``sp_series`` and ``sp_frame``; each value is a dict mapping a short
        label to a sample object of that kind.
    """

    # Raw columns shared by the Series and DataFrame samples below.
    data = {
        'A': [0., 1., 2., 3., np.nan],
        'B': [0, 1, 0, 1, 0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': date_range('1/1/2009', periods=5),
        'E': [0., 1, Timestamp('20100101'), 'foo', 2.]
    }

    # One sample per index flavour: integer, datetime and period.
    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10))

    # Two-level MultiIndex with named levels.
    mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                                                      ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])),
                                          names=['first', 'second']))
    # Series samples; `dup` has a repeated index label ('A').
    series = dict(float=Series(data['A']),
                  int=Series(data['B']),
                  mixed=Series(data['E']),
                  ts=TimeSeries(np.arange(10).astype(np.int64), index=date_range('20130101',periods=10)),
                  mi=Series(np.arange(5).astype(np.float64),
                            index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                                         names=['one', 'two'])),
                  dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']),
                  cat=Series(Categorical(['foo', 'bar', 'baz'])),
                  per=Series([Period('2000Q1')] * 5))

    # Mixed-dtype frame whose columns are relabelled so that 'A' appears twice.
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")
    # DataFrame samples; `dup` and `mixed_dup` carry duplicate column labels.
    frame = dict(float=DataFrame(dict(A=series['float'], B=series['float'] + 1)),
                 int=DataFrame(dict(A=series['int'], B=series['int'] + 1)),
                 mixed=DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']])),
                 mi=DataFrame(dict(A=np.arange(5).astype(np.float64), B=np.arange(5).astype(np.int64)),
                              index=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'baz'],
                                                                       ['one', 'two', 'one', 'two', 'three']])),
                                                           names=['first', 'second'])),
                 dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                               columns=['A', 'B', 'A']),
                 cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))),
                 cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']),
                                              B=np.arange(3).astype(np.int64))),
                 mixed_dup=mixed_dup_df)

    # Panel with mixed float/int items, relabelled so both items are 'ItemA'.
    mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int']))
    mixed_dup_panel.items = ['ItemA', 'ItemA']
    # Panel samples; `dup` and `mixed_dup` carry duplicate item labels.
    panel = dict(float=Panel(dict(ItemA=frame['float'], ItemB=frame['float'] + 1)),
                 dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64),
                           items=['A', 'B', 'A']),
                 mixed_dup=mixed_dup_panel)

    # The sparse samples come from the _create_sp_* helpers defined elsewhere.
    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()))
Example #7
0
    def test_resample_panel(self):
        """Resampling a Panel along a date axis equals resampling each item."""
        rng = date_range("1/1/2000", "6/30/2000")
        values = np.random.randn(3, len(rng), 5)
        panel = Panel(
            values,
            items=["one", "two", "three"],
            major_axis=rng,
            minor_axis=list("abcde"),
        )

        def p_apply(panel, f):
            # Apply ``f`` to every item frame and rebuild a Panel with the
            # same item order.
            frames = dict((item, f(panel[item])) for item in panel.items)
            return Panel(frames, items=panel.items)

        # Dates on the major axis.
        result = panel.resample("M", axis=1)
        expected = p_apply(panel, lambda x: x.resample("M"))
        tm.assert_panel_equal(result, expected)

        # Dates on the minor axis after swapping axes.
        swapped = panel.swapaxes(1, 2)
        result = swapped.resample("M", axis=2)
        expected = p_apply(swapped, lambda x: x.resample("M", axis=1))
        tm.assert_panel_equal(result, expected)
def test_resample_panel():
    """Resampling a Panel and taking the mean equals doing so per item."""
    rng = date_range('1/1/2000', '6/30/2000')
    n = len(rng)

    def p_apply(panel, f):
        # Apply ``f`` to every item frame and rebuild a Panel with the same
        # item order.
        frames = dict((item, f(panel[item])) for item in panel.items)
        return Panel(frames, items=panel.items)

    with catch_warnings(record=True):
        simplefilter("ignore", FutureWarning)
        values = np.random.randn(3, n, 5)
        panel = Panel(values,
                      items=['one', 'two', 'three'],
                      major_axis=rng,
                      minor_axis=list('abcde'))

        # Dates on the major axis.
        result = panel.resample('M', axis=1).mean()
        expected = p_apply(panel, lambda x: x.resample('M').mean())
        tm.assert_panel_equal(result, expected)

        # Dates on the minor axis after swapping axes.
        swapped = panel.swapaxes(1, 2)
        result = swapped.resample('M', axis=2).mean()
        expected = p_apply(swapped,
                           lambda x: x.resample('M', axis=1).mean())
        tm.assert_panel_equal(result, expected)
Example #9
0
    def test_panel_setitem(self):
        """Item assignment through ``[]`` and through ``.loc`` give the same Panel."""

        with catch_warnings(record=True):
            # GH 7763
            # loc and setitem have setting differences
            np.random.seed(0)
            index = range(3)
            columns = list('abc')

            frames = {}
            for label in 'ABC':
                frames[label] = DataFrame(np.random.randn(3, 3),
                                          index=index, columns=columns)
            panel = Panel(frames)

            replace = DataFrame(np.eye(3, 3), index=range(3), columns=columns)
            expected = Panel(dict((label, replace) for label in 'ABC'))

            # plain __setitem__
            by_item = panel.copy()
            for idx in 'ABC':
                by_item[idx] = replace
            tm.assert_panel_equal(by_item, expected)

            # .loc with full slices on the other two axes
            by_loc = panel.copy()
            for idx in 'ABC':
                by_loc.loc[idx, :, :] = replace
            tm.assert_panel_equal(by_loc, expected)
 def execute(self, strategy):
     """Build one EWMAC measure per parameter pair and average them into a Signal.

     Each measure is named ``ewmac_<max>_<min>`` after the larger and
     smaller value of its parameter pair.
     """
     by_name = {}
     for pars in self.par_pairs:
         larger, smaller = max(pars), min(pars)
         ewmac = EWMAC(EMA(larger), EMA(smaller), self.vol)
         by_name["ewmac_{}_{}".format(larger, smaller)] = ewmac(strategy).data
     stacked = Panel(by_name)
     averaged = stacked.mean(axis = 'items')
     return Signal(averaged, [-20, 20], stacked)
Example #11
0
    def test_resample_panel_numpy(self):
        """Resampling with a callable ``how`` matches the named 'mean' aggregation."""
        rng = date_range('1/1/2000', '6/30/2000')
        values = np.random.randn(3, len(rng), 5)
        panel = Panel(values,
                      items=['one', 'two', 'three'],
                      major_axis=rng,
                      minor_axis=list('abcde'))

        via_callable = panel.resample('M', how=lambda x: x.mean(), axis=1)
        via_name = panel.resample('M', how='mean', axis=1)
        tm.assert_panel_equal(via_callable, via_name)
Example #12
0
    def test_sparse_panel(self):
        """Sparse panels round-trip for default, integer-kind and zero-fill variants."""

        items = ["x", "y", "z"]
        p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items))

        # Same round-trip check for each way of building the sparse panel.
        for sparse_kwargs in ({}, {"kind": "integer"}, {"fill_value": 0}):
            sp = p.to_sparse(**sparse_kwargs)
            self._check_roundtrip(sp, tm.assert_panel_equal, check_panel_type=True)
def run(mpfile, **kwargs):
    """Load the tables of a gzipped tar archive into ``mpfile``.

    The archive location and the column names come from the ``input`` entry
    popped from ``mpfile.document['_hdata']``. Every archive member is added
    to the first identifier of ``mpfile`` through ``add_data_table``. Returns
    early (after printing a hint) when the archive does not exist.
    """
    from pandas import Panel, np

    # The archive path is resolved relative to $HOME/work.
    meta_data = mpfile.document['_hdata'].pop('input')
    file_path = os.path.join(os.environ['HOME'], 'work', meta_data['file_path'])
    if not os.path.exists(file_path):
        print 'Please upload', file_path
        return

    table_columns = meta_data['table_columns'].split(' -- ')
    identifier = mpfile.ids[0]

    with tarfile.open(file_path, "r:gz") as tar:
        for member in tar.getmembers():
            # Table name is the member name without its extension.
            name = os.path.splitext(member.name)[0]
            print 'load', name, '...'
            f = tar.extractfile(member)
            if 'pump' in name:
                #fstr = f.read()
                #fstr = ''.join([f.readline() for x in xrange(10)])
                # only load a small area
                # x takes only the value 0, y takes 0..5; each (x, y) pair
                # selects one block of rows/columns of width `delta`.
                list1, list2 = range(1), range(6)
                tuples = [(x, y) for x in list1 for y in list2]
                delta = 150
                for x, y in tuples:
                    lines = []
                    # Read from the start of the member up to row
                    # (x+1)*delta, keeping rows with index > x*delta
                    # (row x*delta itself is dropped).
                    for i in xrange((x+1)*delta):
                        line = f.readline()
                        if i > x*delta:
                            lines.append(line)
                    sub_lines = []
                    # Keep only the y-th block of `delta` comma-separated columns.
                    for line in lines:
                        sub_line = line.strip().split(',')[y*delta:(y+1)*delta]
                        sub_lines.append(','.join(sub_line))
                    fstr = '\n'.join(sub_lines)
                    print 'read_csv ...'
                    # NOTE(review): `fstr` is CSV text, not a path -- confirm
                    # that the `read_csv` in scope accepts raw content.
                    df = read_csv(fstr, header=None)
                    # Wrap every cell in a 1-element list so the values form
                    # a 3-D array with a single minor-axis entry.
                    arr = [[[cell] for cell in row] for row in df.values]
                    sub_name = '{}_{}_{}'.format(name, x, y)
                    df = Panel(arr, minor_axis=[sub_name]).transpose(2, 0, 1).to_frame()
                    print df.head()
                    print 'add', sub_name, '...'
                    mpfile.add_data_table(identifier, df, sub_name)
                    # Rewind so the next (x, y) block reads from the top again.
                    f.seek(0)
            else:
                # Other members are loaded whole with the configured column names.
                fstr = f.read()
                df = read_csv(fstr, names=table_columns)
                print 'add', name, '...'
                mpfile.add_data_table(identifier, df, name)

    print 'Added data from {}'.format(file_path)
Example #14
0
    def test_resample_panel_numpy(self):
        """Resampling with a callable ``how`` matches the named "mean" aggregation."""
        rng = date_range("1/1/2000", "6/30/2000")
        values = np.random.randn(3, len(rng), 5)
        panel = Panel(
            values,
            items=["one", "two", "three"],
            major_axis=rng,
            minor_axis=list("abcde"),
        )

        via_callable = panel.resample("M", how=lambda x: x.mean(), axis=1)
        via_name = panel.resample("M", how="mean", axis=1)
        tm.assert_panel_equal(via_callable, via_name)
Example #15
0
    def test_pipe_panel(self):
        """Panel.pipe forwards the panel positionally or under a named keyword."""
        with catch_warnings(record=True):
            wp = Panel({'r1': DataFrame({"A": [1, 2, 3]})})

            def f(x, y):
                return x + y

            # Panel passed as the first positional argument.
            assert_panel_equal(wp.pipe(f, 2), wp + 2)

            # Panel passed as keyword 'y' via the (callable, name) form.
            assert_panel_equal(wp.pipe((f, 'y'), x=1), wp + 1)

            # Supplying the target keyword explicitly as well is an error.
            with pytest.raises(ValueError):
                result = wp.pipe((f, 'y'), x=1, y=1)
Example #16
0
    def test_sparse_panel(self):
        """Sparse panels round-trip for each variant, with a FutureWarning expected."""

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            items = ["x", "y", "z"]
            p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items))

            # Same round-trip check for each way of building the sparse panel.
            for sparse_kwargs in ({}, {"kind": "integer"}, {"fill_value": 0}):
                sp = p.to_sparse(**sparse_kwargs)
                self._check_roundtrip(sp, tm.assert_panel_equal, check_panel_type=True)
Example #17
0
    def test_sparse_panel(self):
        """Sparse panels round-trip for default, integer-kind and zero-fill variants."""
        items = ['x', 'y', 'z']
        p = Panel(dict((i, tm.makeDataFrame()) for i in items))

        # Same round-trip check for each way of building the sparse panel.
        for sparse_kwargs in ({}, {'kind': 'integer'}, {'fill_value': 0}):
            sp = p.to_sparse(**sparse_kwargs)
            self._check_roundtrip(sp, tm.assert_panel_equal,
                                  check_panel_type=True)
Example #18
0
class PanelMethods(object):
    """Benchmarks for Panel.pct_change and Panel.shift, parametrised by axis."""

    params = ['items', 'major', 'minor']
    param_names = ['axis']

    def setup(self, axis):
        # Warnings raised while building the Panel are captured, not shown.
        with warnings.catch_warnings(record=True):
            values = np.random.randn(100, 1000, 100)
            self.panel = Panel(values)

    def time_pct_change(self, axis):
        panel = self.panel
        with warnings.catch_warnings(record=True):
            panel.pct_change(1, axis=axis)

    def time_shift(self, axis):
        panel = self.panel
        with warnings.catch_warnings(record=True):
            panel.shift(1, axis=axis)
Example #19
0
def rolling_corr_pairwise(df, window, min_periods=None):
    """
    Build the rolling correlation of every pair of columns of ``df`` and
    return them as a Panel whose items are dates

    Parameters
    ----------
    df : DataFrame
    window : int
    min_periods : int, default None

    Returns
    -------
    correls : Panel
    """
    from pandas import Panel
    from collections import defaultdict

    pairwise = defaultdict(dict)

    # Visit each unordered pair once (diagonal included) and store the
    # result under both orderings.
    for start, left in enumerate(df.columns):
        for right in df.columns[start:]:
            pair_corr = rolling_corr(df[left], df[right], window,
                                     min_periods=min_periods)
            pairwise[left][right] = pair_corr
            pairwise[right][left] = pair_corr

    by_column = Panel.from_dict(pairwise)
    return by_column.swapaxes('items', 'major')
Example #20
0
def load_secoora_ncs(run_name):
    """
    Loads local files using the run_name date.
    NOTE: Consider moving this inside the notebook.

    Reads ``<run_name>/<run_name>-OBS_DATA.nc`` and
    ``<run_name>/<run_name>-SECOORA_OBS_DATA.nc`` as the observations, then
    every other ``*.nc`` file in the ``run_name`` directory as a model,
    re-indexed onto the observation index.

    Returns a Panel built from the observation and model frames with axes
    0 and 2 swapped.
    """
    fname = '{}-{}.nc'.format
    OBS_DATA = nc2df(os.path.join(run_name,
                                  fname(run_name, 'OBS_DATA')))
    SECOORA_OBS_DATA = nc2df(os.path.join(run_name,
                                          fname(run_name, 'SECOORA_OBS_DATA')))

    ALL_OBS_DATA = concat([OBS_DATA, SECOORA_OBS_DATA], axis=1)
    index = ALL_OBS_DATA.index

    dfs = dict(OBS_DATA=ALL_OBS_DATA)
    for path in glob(os.path.join(run_name, "*.nc")):
        # Work on the file name only: a '.' or 'OBS_DATA' in the directory
        # part (e.g. run_name='./2015-01-01') must not affect the parsing.
        base = os.path.basename(path)
        if 'OBS_DATA' in base:
            continue
        # '<run_name>-<MODEL>.nc' -> '<MODEL>'
        model = base.split('.')[0].split('-')[-1]
        df = nc2df(path)
        # FIXME: Horrible work around duplicate times.
        if len(df.index.values) != len(np.unique(df.index.values)):
            kw = dict(subset='index', keep='last')
            df = df.reset_index().drop_duplicates(**kw).set_index('index')
        kw = dict(method='time', limit=30)
        df = df.reindex(index).interpolate(**kw).ix[index]
        dfs.update({model: df})

    return Panel.fromDict(dfs).swapaxes(0, 2)
 def __init__(self, tickers, start_date, end_date):
     '''
     Store the date range, create the data handler and start with one
     empty frame per ticker (tickers in sorted order).
     '''
     self.start = start_date
     self.end = end_date
     self.downloader = Data.Handler("/home/mark/Data/MarketData/Stocks/Python/")
     empty_frames = {}
     for ticker in sorted(tickers):
         empty_frames[ticker] = DataFrame(None)
     self.instruments = Panel(empty_frames)
Example #22
0
    def test_panel_assignment(self):
        """Assigning a Panel into a multi-axis ``.loc`` selection is not implemented."""

        with catch_warnings(record=True):
            # GH3777
            axes = dict(items=['Item1', 'Item2'],
                        major_axis=date_range('1/1/2000', periods=5),
                        minor_axis=['A', 'B', 'C', 'D'])
            wp = Panel(np.random.randn(2, 5, 4), **axes)
            wp2 = Panel(np.random.randn(2, 5, 4), **axes)

            # TODO: unused?
            # expected = wp.loc[['Item1', 'Item2'], :, ['A', 'B']]

            with pytest.raises(NotImplementedError):
                source = wp2.loc[['Item1', 'Item2'], :, ['A', 'B']]
                wp.loc[['Item1', 'Item2'], :, ['A', 'B']] = source
Example #23
0
    def test_big_table(self):
        """Append a large Panel to an HDFStore and read it back (always skipped)."""
        raise nose.SkipTest('no big table')

        # create and write a big table
        item_labels = ['Item%s' % i for i in xrange(20)]
        minor_labels = ['E%s' % i for i in xrange(1000)]
        wp = Panel(np.random.randn(20, 1000, 1000), items=item_labels,
                   major_axis=date_range('1/1/2000', periods=1000),
                   minor_axis=minor_labels)

        wp.ix[:, 100:200, 300:400] = np.nan

        try:
            store = HDFStore(self.scratchpath)
            store._debug_memory = True
            store.append('wp', wp)
            recons = store.select('wp')
        finally:
            store.close()
            os.remove(self.scratchpath)
Example #24
0
    def test_panel_setitem_with_multiindex(self):
        """iloc assignment works as each Panel axis in turn becomes a MultiIndex."""

        with catch_warnings(record=True):
            # 10360
            # failing with a multi-index
            axes = dict(items=['A', 'B'], major_axis=[0, 1],
                        minor_axis=['X', 'Y', 'Z'])

            def check(key, values, filled):
                # Start from an all-zero panel on the current axes, assign
                # through iloc, and compare with a panel built from `filled`.
                p = Panel(0., **axes)
                p.iloc[key] = values
                tm.assert_panel_equal(p, Panel(filled, **axes))

            first_row = (0, 0, slice(None))
            arr = np.array([[[1, 2, 3], [0, 0, 0]],
                            [[0, 0, 0], [0, 0, 0]]],
                           dtype=np.float64)

            # reg index
            check(first_row, [1, 2, 3], arr)

            # multi-indexes
            axes['items'] = MultiIndex.from_tuples(
                [('A', 'a'), ('B', 'b')])
            check(first_row, [1, 2, 3], arr)

            axes['major_axis'] = MultiIndex.from_tuples(
                [('A', 1), ('A', 2)])
            check(first_row, [1, 2, 3], arr)

            axes['minor_axis'] = MultiIndex.from_product(
                [['X'], range(3)])
            check(first_row, [1, 2, 3], arr)

            # assignment down the major axis instead of along the minor axis
            arr = np.array(
                [[[1, 0, 0], [2, 0, 0]], [[0, 0, 0], [0, 0, 0]]],
                dtype=np.float64)
            check((0, slice(None), 0), [1, 2], arr)
Example #25
0
    def as_dataframe(self):
        """
        Creates a pandas object from the raster stored in this resource's 'tif' file. The object is cached on the
        instance (``self._df``) and pickled to the 'dfx' file; the pickled copy is only picked up if its mtime is not
        older than the tif's mtime.

        :return: a Panel when the raster array can be wrapped in one, otherwise a DataFrame of the same array.
        """

        dfx_path = self.get_filename('dfx')
        tiff_path = self.get_filename('tif')
        if hasattr(self, '_df'):
            return self._df

        elif os.path.exists(dfx_path) and os.stat(dfx_path).st_mtime >= os.stat(tiff_path).st_mtime:
            self._df = Panel.read_pickle(dfx_path)
            return self._df
        else:
            ds = gdal.Open(tiff_path)
            try:
                df = Panel(ds.ReadAsArray())
                df.to_pickle(dfx_path)
            except Exception:
                # Fall back to a DataFrame when the Panel cannot be built or
                # pickled. `Exception` (not a bare except) lets
                # KeyboardInterrupt/SystemExit propagate.
                df = DataFrame(ds.ReadAsArray())
                df.to_pickle(dfx_path)
            self._df = df
            return self._df
Example #26
0
 def apply(self, func, *args, **kwargs):
     """Apply ``func`` to each item's frame, grouped by ``self.grouper``.

     ``func`` is either the name of a ``DataFrameGroupBy`` method, which is
     looked up and called with ``*args, **kwargs``, or a callable, which is
     handed to ``DataFrameGroupBy.apply``. The per-item results are
     assembled into a Panel.
     """
     result = {}
     for key, df in self.obj.iteritems():
         grp = DataFrameGroupBy(df, grouper=self.grouper)
         if callable(func):
             # Previously `res` was never assigned for a callable, so the
             # method failed with UnboundLocalError.
             res = grp.apply(func, *args, **kwargs)
         else:
             res = getattr(grp, func)(*args, **kwargs)
         result[key] = res
     return Panel.from_dict(result)
def test_panel_aggregation():
    """Aggregating a grouped Panel with a custom function matches ``mean``."""
    ind = pd.date_range('1/1/2000', periods=100)
    values = np.random.randn(2, len(ind), 4)
    wp = Panel(values, items=['Item1', 'Item2'], major_axis=ind,
               minor_axis=['A', 'B', 'C', 'D'])

    # Monthly grouper along axis 1.
    _, grouper, _ = TimeGrouper('M', axis=1)._get_grouper(wp)
    bingrouped = wp.groupby(grouper)
    binagg = bingrouped.mean()

    def f(x):
        # Each group must arrive as a Panel.
        assert (isinstance(x, Panel))
        return x.mean(1)

    tm.assert_panel_equal(bingrouped.agg(f), binagg)
Example #28
0
        def _check(frame):
            # Stacking the sparse frame must give the same values as going
            # through a one-item Panel and to_frame().
            dense_frame = frame.to_dense()

            from_dense_lp = Panel.from_dict({"foo": frame}).to_frame()
            from_sparse_lp = spf.stack_sparse_frame(frame)

            same_values = np.array_equal(from_dense_lp.values, from_sparse_lp.values)
            self.assert_(same_values)
Example #29
0
    def testFamaMacBethRolling(self):
        """Run the extended Fama-MacBeth check for rolling and expanding windows."""
        # self.checkFamaMacBethExtended('rolling', self.panel_x, self.panel_y,
        #                               nw_lags_beta=2)

        # df = DataFrame(np.random.randn(50, 10))
        frames = dict((k, DataFrame(np.random.randn(50, 10))) for k in "abcdefg")
        x = Panel.from_dict(frames)
        y = DataFrame(np.random.randn(50, 10)) + DataFrame(0.01 * np.random.randn(50, 10))
        for window_type in ("rolling", "expanding"):
            self.checkFamaMacBethExtended(window_type, x, y, nw_lags_beta=2)
Example #30
0
        def _check(frame):
            # Stacking the sparse frame must give the same values as going
            # through a one-item Panel and to_frame().
            dense_frame = frame.to_dense()  # noqa

            from_dense_lp = Panel.from_dict({'foo': frame}).to_frame()
            from_sparse_lp = spf.stack_sparse_frame(frame)

            self.assert_numpy_array_equal(from_dense_lp.values,
                                          from_sparse_lp.values)
Example #31
0
def create_data():
    """Create the pickle/msgpack data.

    Returns
    -------
    dict
        Keys are ``series``, ``frame``, ``panel``, ``index``, ``scalars``,
        ``mi``, ``sp_series``, ``sp_frame``, ``cat``, ``timestamp`` and
        ``offsets``; each value is a dict mapping a short label to a sample
        object of that kind.
    """

    # Raw columns shared by the Series and DataFrame samples below.
    data = {
        u'A': [0., 1., 2., 3., np.nan],
        u'B': [0, 1, 0, 1, 0],
        u'C': [u'foo1', u'foo2', u'foo3', u'foo4', u'foo5'],
        u'D': date_range('1/1/2009', periods=5),
        u'E': [0., 1, Timestamp('20100101'), u'foo', 2.]
    }

    scalars = dict(timestamp=Timestamp('20130101'), period=Period('2012', 'M'))

    # One sample per index flavour.
    index = dict(int=Index(np.arange(10)),
                 date=date_range('20130101', periods=10),
                 period=period_range('2013-01-01', freq='M', periods=10),
                 float=Index(np.arange(10, dtype=np.float64)),
                 uint=Index(np.arange(10, dtype=np.uint64)),
                 timedelta=timedelta_range('00:00:00', freq='30T', periods=10))

    # RangeIndex is only added when the running version is at least 0.18.
    if _loose_version >= LooseVersion('0.18'):
        from pandas import RangeIndex
        index['range'] = RangeIndex(10)

    # interval_range is only added when the running version is at least 0.21.
    if _loose_version >= LooseVersion('0.21'):
        from pandas import interval_range
        index['interval'] = interval_range(0, periods=10)

    # Two-level MultiIndex with named levels.
    mi = dict(reg2=MultiIndex.from_tuples(tuple(
        zip(*[[u'bar', u'bar', u'baz', u'baz', u'foo', u'foo', u'qux', u'qux'],
              [u'one', u'two', u'one', u'two', u'one', u'two', u'one', u'two']
              ])),
                                          names=[u'first', u'second']))

    # Series samples; `dup` has a repeated index label (u'A').
    series = dict(
        float=Series(data[u'A']),
        int=Series(data[u'B']),
        mixed=Series(data[u'E']),
        ts=Series(np.arange(10).astype(np.int64),
                  index=date_range('20130101', periods=10)),
        mi=Series(np.arange(5).astype(np.float64),
                  index=MultiIndex.from_tuples(tuple(
                      zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])),
                                               names=[u'one', u'two'])),
        dup=Series(np.arange(5).astype(np.float64),
                   index=[u'A', u'B', u'C', u'D', u'A']),
        cat=Series(Categorical([u'foo', u'bar', u'baz'])),
        dt=Series(date_range('20130101', periods=5)),
        dt_tz=Series(date_range('20130101', periods=5, tz='US/Eastern')),
        period=Series([Period('2000Q1')] * 5))

    # Mixed-dtype frame whose columns are relabelled so that 'A' appears twice.
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list(u"ABCDA")
    # DataFrame samples; `dup` and `mixed_dup` carry duplicate column labels,
    # the `dt_mixed*_tzs` frames hold columns in different time zones.
    frame = dict(
        float=DataFrame({
            u'A': series[u'float'],
            u'B': series[u'float'] + 1
        }),
        int=DataFrame({
            u'A': series[u'int'],
            u'B': series[u'int'] + 1
        }),
        mixed=DataFrame({k: data[k]
                         for k in [u'A', u'B', u'C', u'D']}),
        mi=DataFrame(
            {
                u'A': np.arange(5).astype(np.float64),
                u'B': np.arange(5).astype(np.int64)
            },
            index=MultiIndex.from_tuples(tuple(
                zip(*[[u'bar', u'bar', u'baz', u'baz', u'baz'],
                      [u'one', u'two', u'one', u'two', u'three']])),
                                         names=[u'first', u'second'])),
        dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64),
                      columns=[u'A', u'B', u'A']),
        cat_onecol=DataFrame({u'A': Categorical([u'foo', u'bar'])}),
        cat_and_float=DataFrame({
            u'A': Categorical([u'foo', u'bar', u'baz']),
            u'B': np.arange(3).astype(np.int64)
        }),
        mixed_dup=mixed_dup_df,
        dt_mixed_tzs=DataFrame(
            {
                u'A': Timestamp('20130102', tz='US/Eastern'),
                u'B': Timestamp('20130603', tz='CET')
            },
            index=range(5)),
        dt_mixed2_tzs=DataFrame(
            {
                u'A': Timestamp('20130102', tz='US/Eastern'),
                u'B': Timestamp('20130603', tz='CET'),
                u'C': Timestamp('20130603', tz='UTC')
            },
            index=range(5)))

    # Warnings raised while building the Panel samples are captured, not shown.
    with catch_warnings(record=True):
        # Panel with mixed float/int items, relabelled so both items are 'ItemA'.
        mixed_dup_panel = Panel({
            u'ItemA': frame[u'float'],
            u'ItemB': frame[u'int']
        })
        mixed_dup_panel.items = [u'ItemA', u'ItemA']
        # Panel samples; `dup` and `mixed_dup` carry duplicate item labels.
        panel = dict(float=Panel({
            u'ItemA': frame[u'float'],
            u'ItemB': frame[u'float'] + 1
        }),
                     dup=Panel(np.arange(30).reshape(3, 5,
                                                     2).astype(np.float64),
                               items=[u'A', u'B', u'A']),
                     mixed_dup=mixed_dup_panel)

    # Categoricals with 7, 1000 and 10000 distinct values
    # (presumably chosen to get int8/int16/int32 codes -- TODO confirm).
    cat = dict(int8=Categorical(list('abcdefg')),
               int16=Categorical(np.arange(1000)),
               int32=Categorical(np.arange(10000)))

    timestamp = dict(normal=Timestamp('2011-01-01'),
                     nat=NaT,
                     tz=Timestamp('2011-01-01', tz='US/Eastern'))

    # Versions before 0.19.2 get the `offset=` keyword, later ones `freq=`.
    if _loose_version < LooseVersion('0.19.2'):
        timestamp['freq'] = Timestamp('2011-01-01', offset='D')
        timestamp['both'] = Timestamp('2011-01-01',
                                      tz='Asia/Tokyo',
                                      offset='M')
    else:
        timestamp['freq'] = Timestamp('2011-01-01', freq='D')
        timestamp['both'] = Timestamp('2011-01-01', tz='Asia/Tokyo', freq='M')

    # One instance of each date offset class, keyed by a descriptive label.
    off = {
        'DateOffset': DateOffset(years=1),
        'DateOffset_h_ns': DateOffset(hour=6, nanoseconds=5824),
        'BusinessDay': BusinessDay(offset=timedelta(seconds=9)),
        'BusinessHour': BusinessHour(normalize=True, n=6, end='15:14'),
        'CustomBusinessDay': CustomBusinessDay(weekmask='Mon Fri'),
        'SemiMonthBegin': SemiMonthBegin(day_of_month=9),
        'SemiMonthEnd': SemiMonthEnd(day_of_month=24),
        'MonthBegin': MonthBegin(1),
        'MonthEnd': MonthEnd(1),
        'QuarterBegin': QuarterBegin(1),
        'QuarterEnd': QuarterEnd(1),
        'Day': Day(1),
        'YearBegin': YearBegin(1),
        'YearEnd': YearEnd(1),
        'Week': Week(1),
        'Week_Tues': Week(2, normalize=False, weekday=1),
        'WeekOfMonth': WeekOfMonth(week=3, weekday=4),
        'LastWeekOfMonth': LastWeekOfMonth(n=1, weekday=3),
        'FY5253': FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        'Easter': Easter(),
        'Hour': Hour(1),
        'Minute': Minute(1)
    }

    # The sparse samples come from the _create_sp_* helpers defined elsewhere.
    return dict(series=series,
                frame=frame,
                panel=panel,
                index=index,
                scalars=scalars,
                mi=mi,
                sp_series=dict(float=_create_sp_series(),
                               ts=_create_sp_tsseries()),
                sp_frame=dict(float=_create_sp_frame()),
                cat=cat,
                timestamp=timestamp,
                offsets=off)
Example #32
0
 def test_dense_to_sparse(self):
     """Converting a dense Panel to sparse yields SparseSeries columns."""
     sparse_panel = Panel.from_dict(self.data_dict).to_sparse()
     column = sparse_panel['ItemA']['A']
     tm.assertIsInstance(column, SparseSeries)
Example #33
0
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file handle even on failure."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def WQXtoPandas(
        xmlLocation,
        charDict,
        outputPath='.',
        fromFile=False,
        outputDirName='Processed-Sites',
        RUN_PHREEQC=False,
        PHREEQC_PATH='/home/mcoving/phreeqc-2.18.0/bin/',
        DATABASE_FILE='/home/mcoving/phreeqc-2.18.0/database/phreeqc.dat',
        LOG_FILE='Result.log',
        START_FILE=None,
        splittag='',
        bracket_charge_balance=False):
    """
    Processes a WQX xml data file and loads data for each site in the WQX file into Pandas data objects that are stored in directories for each site.

    Parameters
    ----------
    xmlLocation : string
       Content depends on mode in which WQXtoPandas is run. When fromFile is set to False (input methods 2 or 3 in excel file) this string contains the html for a query to the USGS NWIS database to obtain an xml file of the desired data.  Alternatively, if fromFile is True (input method 1 in excel file) then this string contains the name of the xml file from which to read the data.

    charDict : dict
       A dictionary containing information about the characteristics to be processed.  Keys are EPA SRS characteristic names. Each entry in the dictionary is a second dictionary that contains keys IsRequired, pcode, fraction, and quality. These entries tell WQXtoPandas whether a given characteristic is required in order to process a sample, and whether a specific pcode, fraction, or quality should be required.  See excel example file for more details.

    outputPath : string
       path to directory that will contain output directory

    fromFile : boolean
       True if data will be read from an xml file already present on computer.  False if xml file should be queried from NWIS. (Default=False)

    outputDirName : string
       Name of output directory where all site data will be written out. (Default='Processed-Sites')

    RUN_PHREEQC : boolean
       Set to true if samples should be processed through PHREEQC. (Default=False)

    PHREEQC_PATH : string
       Path to PHREEQC executable (folder only, not executable file name)

    DATABASE_FILE : string
       Path to database file that PHREEQC should use, including database file name.

    LOG_FILE : string
       Name of log file that WQXtoPandas will create. (Default='Result.log')

    START_FILE : string
       Name of xls start file that was used to run this instance of WQXtoPandas. Name will be written out in log file. (Default=None, written out as 'None')

    splittag : string
       Suffix appended to LOG_FILE to form both the log file name and the name of the cached xml file (LOG_FILE + splittag + '.xml'). (Default='')

    bracket_charge_balance : bool
       If set to true, WQXtoPandas will alternately force charge balance on calcium and alkalinity, while the latter is not physically meaningful, this provides a useful estimate of uncertainty for cases with high charge balance errors.  This is most useful for water that is very dilute or with high organic content, such that titrated alkalinity values are artificially high.

    Returns
    -------

    Returns 0 if execution successful.  Returns -1 in case of error.

    Notes
    -----

    Designed to be run through convenience function runWQXtoPandas().
    """
    # Number of attempts made to retrieve the daily discharge for a sample.
    MAX_DISCHARGE_TRIES = 5
    try:
        #Check to see if output directory exists
        absOutputDirPath = os.path.abspath(outputPath)
        sitesdir = os.path.join(absOutputDirPath, outputDirName)
        print("sitesdir", sitesdir)
        if not os.path.exists(sitesdir):
            try:
                os.makedirs(sitesdir)
            except os.error:
                print(
                    "Problem creating output directory. Check output path name: "
                    + outputPath)
                return -1
        #create xml tree
        if fromFile:
            #read from file
            wqxtree = etree.ElementTree(file=xmlLocation)
        else:
            #check whether we already have a matching xml file
            xmlSaveFile = LOG_FILE + splittag + '.xml'
            if os.path.isfile(xmlSaveFile):
                goodAnswer = False
                #keep asking until the answer starts with 'y' or 'n'
                while not goodAnswer:
                    answer = input(
                        "An xml file (" + xmlSaveFile +
                        ") already exists.  \n Use this instead of html query (y or n)?"
                    )
                    if answer.startswith('y'):
                        #read from file
                        wqxtree = etree.ElementTree(file=xmlSaveFile)
                        goodAnswer = True
                        queryXML = False
                    elif answer.startswith('n'):
                        goodAnswer = True
                        queryXML = True
            else:
                queryXML = True
            #If we don't have a matching xml file, or we want to obtain a new one, then get the new xml
            if queryXML:
                print("Obtaining xml file from USGS NWIS using html query...")
                #parse from html query
                print("XML query string: ", xmlLocation)
                r = requests.get(xmlLocation)
                if not r.ok:
                    #There is some problem with the xml query; report it.
                    #The response body is still saved and parsed below, so a
                    #malformed reply surfaces as an XML syntax error.
                    print("Response: ", str(r))
                    print("Reason: ", r.reason)
                    #The Warning header is optional, so do not index it directly
                    print("Warning: ", r.headers.get('Warning'))
                #write to xml file
                try:
                    #write xml to file (closed before it is parsed again)
                    with open(xmlSaveFile, 'w') as xmlFile:
                        print(r.text, file=xmlFile)
                    wqxtree = etree.ElementTree(file=xmlSaveFile)
                except IOError:
                    print("Problem writing to xml file to store html query: " +
                          xmlSaveFile)
                    return -1
        #begin parsing XML tree
        root = wqxtree.getroot()
        #get namespace map
        NSMAP = root.nsmap
        WQX = "{%s}" % NSMAP[None]
        #iterate over all <Activity> tags within file and process each sample
        samples_processed = []
        samples_not_processed = []
        sitesDict = {}
        for activity in wqxtree.iter(tag=WQX + "Activity"):
            processThisSample = True
            reason = ''
            description = activity.find(WQX + "ActivityDescription")
            if description is not None:
                datetext = description.findtext(WQX + "ActivityStartDate")
                starttime = description.find(WQX + "ActivityStartTime")
                if starttime is not None:
                    timetext = starttime.findtext(WQX + "Time")
                    timezone = starttime.findtext(WQX + "TimeZoneCode")
                else:
                    timetext = ''
                    timezone = ''
                location = description.findtext(WQX +
                                                "MonitoringLocationIdentifier")
                descriptionDict = {
                    'location': location,
                    'date': datetext,
                    'time': timetext,
                    'timezone': timezone
                }
                if location is None or datetext is None:
                    #Without both a site and a date the sample cannot be
                    #filed; keep printable placeholders for the log.
                    processThisSample = False
                    reason = 'Missing location or start date'
                    location = location or 'unknown location'
                    datetext = datetext or 'unknown date'
            else:
                descriptionDict = None
                processThisSample = False
                reason = 'No description'
                #Placeholders so the progress message and the log entry do not
                #reuse (or fail on) values from a previous activity.
                location = 'unknown location'
                datetext = 'unknown date'
            print('Processing sample from ' + location + ' on ' + datetext)
            #create null sample dict
            sampleDict = {}
            sampleMetaDict = {}
            #iterate though all results for this activity
            for result in activity.iter(tag=WQX + 'Result'):
                if not processThisSample:
                    #nothing more can be collected for a rejected sample
                    break
                try:
                    resultdesc = result.find(WQX + "ResultDescription")
                    if resultdesc is None:
                        continue
                    characteristic = resultdesc.findtext(
                        WQX + "CharacteristicName")
                    if characteristic not in charDict:
                        continue
                    criteria = charDict[characteristic]
                    samplefraction = resultdesc.findtext(
                        WQX + "ResultSampleFractionText")
                    pcode = resultdesc.findtext(WQX + "USGSPCode")
                    quality = resultdesc.findtext(
                        WQX + "ResultStatusIdentifier")
                    measure = resultdesc.find(WQX + "ResultMeasure")
                    if measure is None:
                        continue
                    count = 1.0
                    value = measure.findtext(WQX + "ResultMeasureValue")
                    units = measure.findtext(WQX + "MeasureUnitCode")
                    #split pcode into a {code: priority} map; position in the
                    #';'-separated list is the priority (lower wins)
                    pcodeDict = {}
                    for codePriority, code in enumerate(
                            criteria['pcode'].split(';')):
                        code = code.strip()
                        if code != '':
                            pcodeDict[code] = codePriority
                    #Check whether characteristic meets criteria for
                    #inclusion, otherwise don't add to sampleDict.
                    #A criterion of '0' means "do not filter on this field".
                    if (criteria['fraction'] != '0'
                            and criteria['fraction'] != samplefraction):
                        continue
                    if criteria['pcode'] != '0' and pcode not in pcodeDict:
                        continue
                    if (criteria['quality'] != '0'
                            and criteria['quality'] != quality):
                        continue
                    #end of characteristic criteria check
                    #Process duplicate characteristics
                    if characteristic in sampleDict:
                        priorMeta = sampleMetaDict[characteristic]
                        #if there are already multiple pcodes get only first one
                        priorPcode = priorMeta['pcode'].split(';')[0]
                        averageValue = False
                        if len(pcodeDict) > 1:
                            thisPcodePriority = pcodeDict[pcode]
                            priorPcodePriority = pcodeDict[priorPcode]
                            if thisPcodePriority > priorPcodePriority:
                                #previous characteristic remains
                                continue
                            elif thisPcodePriority == priorPcodePriority:
                                averageValue = True
                            #otherwise this result has the better priority and
                            #simply replaces the stored one below
                        else:
                            averageValue = True
                        if averageValue:
                            #fold this value into the running mean
                            count = priorMeta['count']
                            count += 1.
                            oldvalue = float(sampleDict[characteristic])
                            newvalue = (oldvalue * (count - 1.) +
                                        float(value)) / count
                            value = str(newvalue)
                            pcode = priorPcode + '; ' + pcode
                            units = priorMeta['units'] + '; ' + units

                    sampleDict[characteristic] = value
                    sampleMetaDict[characteristic] = {
                        'samplefraction': samplefraction,
                        'units': units,
                        'pcode': pcode,
                        'quality': quality,
                        'count': count
                    }
                except etree.XMLSyntaxError as detail:
                    print("File contains invalid XML syntax: ", detail)
                    processThisSample = False
                    reason = "Entry contains invalid XML syntax."
            #end results loop
            #check whether sample has all the required constituents
            if processThisSample:
                for characteristic, criteria in charDict.items():
                    if (criteria['IsRequired'] != '0'
                            and characteristic not in sampleDict):
                        processThisSample = False
                        reason += characteristic + ' not available. '
            if processThisSample:
                #check to see whether site directory exists, if not, create it
                sampledir = os.path.join(sitesdir, location)
                if not os.path.exists(sampledir):
                    try:
                        os.makedirs(sampledir)
                    except os.error:
                        print("Problem creating location directory: " +
                              sampledir)
                        processThisSample = False
                        reason = "Problem creating location directory: " + sampledir

            if processThisSample:
                #Pull daily discharge data from USGS website
                #currently hard-wired to pcode 00060 (daily discharge, cfs)
                dischargeDict = None
                for _attempt in range(MAX_DISCHARGE_TRIES):
                    candidate = GetDailyDischarge(location, datetext)
                    #GetDailyDischarge signals failure with -1
                    if candidate != -1:
                        dischargeDict = candidate
                        break
                if dischargeDict is not None:
                    sampleDict['Stream flow, mean. daily'] = dischargeDict[
                        'discharge']
                    sampleMetaDict['Stream flow, mean. daily'] = {
                        'units': 'cfs',
                        'pcode': '00060',
                        'quality': dischargeDict['quality'],
                        'count': 1,
                        'samplefraction': None
                    }
                    descriptionDict['name'] = dischargeDict['name']
                else:
                    #Possibly allow this sample to be thrown out if no mean daily discharge, and/or similar for instantaneous discharge
                    sampleDict['Stream flow, mean. daily'] = None
                    sampleMetaDict['Stream flow, mean. daily'] = {
                        'units': 'cfs',
                        'pcode': '00060',
                        'quality': None,
                        'count': 1,
                        'samplefraction': None
                    }
                # Create data frame row for this sample date
                #(a missing <Time> element yields None, an absent start time '')
                if descriptionDict['time']:
                    rowdate = to_datetime(datetext + ' ' +
                                          descriptionDict['time'])
                else:
                    rowdate = to_datetime(datetext)
                metaColumns = list(sampleMetaDict.keys())

                def metaFrame(field):
                    #one-row frame holding `field` of every characteristic
                    return DataFrame(
                        [extractValues(sampleMetaDict, [field])['values']],
                        index=[rowdate],
                        columns=metaColumns)

                #Create Panel to contain sample meta data
                samplePanelRow = Panel({
                    'data':
                    DataFrame(sampleDict, index=[rowdate], dtype='float'),
                    'time':
                    DataFrame(descriptionDict['time'],
                              index=[rowdate],
                              columns=metaColumns),
                    'timezone':
                    DataFrame(descriptionDict['timezone'],
                              index=[rowdate],
                              columns=metaColumns),
                    'pcode': metaFrame('pcode'),
                    'quality': metaFrame('quality'),
                    'fraction': metaFrame('samplefraction'),
                    'units': metaFrame('units'),
                    'count': metaFrame('count'),
                })
                #Previous solution was reading/writing from pickle files
                #New solution will keep all data in memory until end.
                #This could cause memory problems with large data sets

                #Test whether a panel for this location already exists
                if location in sitesDict:
                    tempPanel = sitesDict[location]
                    sitesDict[location] = concat([tempPanel, samplePanelRow],
                                                 axis=1)
                else:
                    sitesDict[location] = samplePanelRow
            #add one to number of samples processed
            if processThisSample:
                samples_processed.append(location + ' ' + datetext)
            else:
                samples_not_processed.append(location + ' ' + datetext +
                                             ' - ' + reason)
        print('Number of Samples Processed = ' + str(len(samples_processed)))
        print('Number of Samples Not Processed = ' +
              str(len(samples_not_processed)))

        #Write out individual site data pickle and csv files in each site directory
        print('Writing out site data files...')
        for location, pnl in sitesDict.items():
            print(location)
            pickleFile = os.path.join(sitesdir, location,
                                      location + '-Panel.pkl')
            _dump_pickle(pnl, pickleFile)
            pnl.to_excel(pickleFile[:-3] + 'xls')
            #Retrieve and store site description metadata
            siteDescriptionDataDF = GetSiteData(location)
            siteDescriptionDataFileName = os.path.join(
                sitesdir, location, location + '-Site-Description.pkl')
            _dump_pickle(siteDescriptionDataDF, siteDescriptionDataFileName)
            siteDescriptionDataDF.to_csv(siteDescriptionDataFileName[:-3] +
                                         'csv')
        #Process sites through PHREEQC
        if RUN_PHREEQC:
            print("Processing site water chemisty data in PHREEQC...")
            for location, pnl in sitesDict.items():
                phreeqc_df = processPanel(pnl,
                                          os.path.join(sitesdir, location),
                                          PHREEQC_PATH, DATABASE_FILE)
                phreeqc_site_file = os.path.join(sitesdir, location,
                                                 location + '-PHREEQC.pkl')
                try:
                    _dump_pickle(phreeqc_df, phreeqc_site_file)
                    phreeqc_df.to_csv(phreeqc_site_file[:-3] + 'csv')
                except IOError:
                    print('Problem writing out PHREEQC data file.')
            if bracket_charge_balance:
                for location, pnl in sitesDict.items():
                    #Force balance on Calcium, then on Alkalinity
                    for element in ('Ca', 'Alk'):
                        phreeqc_df_forced = processPanel(
                            pnl,
                            os.path.join(sitesdir, location),
                            PHREEQC_PATH,
                            DATABASE_FILE,
                            force_balance=element)
                        phreeqc_site_file_forced = os.path.join(
                            sitesdir, location,
                            location + '-PHREEQC-' + element + '.pkl')
                        try:
                            _dump_pickle(phreeqc_df_forced,
                                         phreeqc_site_file_forced)
                            phreeqc_df_forced.to_csv(
                                phreeqc_site_file_forced[:-3] + 'csv')
                        except IOError:
                            print('Problem writing out PHREEQC ' + element +
                                  ' data file.')
        #Create log file
        print('Writing log file: ' + LOG_FILE + splittag)
        try:
            with open(LOG_FILE + splittag, 'w') as log_file:
                #str() because START_FILE defaults to None
                print('Start file = ' + str(START_FILE), file=log_file)
                print('Number of Samples Processed = ' +
                      str(len(samples_processed)),
                      file=log_file)
                print('Number of Samples Not Processed = ' +
                      str(len(samples_not_processed)),
                      file=log_file)
                print("###############", file=log_file)
                print("Characteristics", file=log_file)
                print("###############", file=log_file)
                printColumnNames = True
                for key, flags in charDict.items():
                    if printColumnNames:
                        #header row, taken from the first entry's keys
                        names = ['characteristic']
                        names.extend(str(column) for column in flags.keys())
                        print("\t".join(names), file=log_file)
                        printColumnNames = False
                    #only string-valued flags are written out
                    columns = [key]
                    columns.extend(flag for flag in flags.values()
                                   if isinstance(flag, str))
                    print("\t".join(columns), file=log_file)
                print("###############", file=log_file)
                print("Samples processed", file=log_file)
                print("###############", file=log_file)
                for line in samples_processed:
                    print(line, file=log_file)
                print("###############", file=log_file)
                print("Samples not processed", file=log_file)
                print("###############", file=log_file)
                for line in samples_not_processed:
                    print(line, file=log_file)
        except IOError:
            print("Problem opening log file: " + LOG_FILE)
            return -1
    #exceptions for parsing of xml file
    except IOError:
        print("Error opening xml file. Does it exist?")
        #Note: can throw this error when discharge values are not read correctly,
        #I should fix this, 6/16/2014
        return -1
    except etree.XMLSyntaxError as detail:
        print("File contains invalid XML syntax: ", detail)
        return -1
    except requests.exceptions.RequestException as detail:
        print("Error retrieving data by xml query: ", detail)
        return -1
    return 0
Example #34
0
 def make_source(self):
     """Wrap ``self.raw_data`` in a Panel and localize it to UTC along axis 1."""
     source_panel = Panel(self.raw_data)
     return source_panel.tz_localize('UTC', axis=1)
Example #35
0
    def test_partial_setting(self):
        """Exercise enlargement ("partial setting") through the indexers.

        Covers Series, DataFrame and Panel targets (GH2578) and enlarging a
        date-indexed frame through ``loc``/``at`` (GH 8473).
        """
        # ## series ##
        # Setting a label that does not exist appends it, both through plain
        # item assignment and through .loc, for an int and a float value.
        base_ser = Series([1, 2, 3])
        for new_value in (5, 5.):
            expected = Series([1, 2, 3, new_value], index=[0, 1, 2, 5])

            ser = base_ser.copy()
            ser[5] = new_value
            tm.assert_series_equal(ser, expected)

            ser = base_ser.copy()
            ser.loc[5] = new_value
            tm.assert_series_equal(ser, expected)

        # Positional setters must raise instead of enlarging.
        ser = base_ser.copy()
        with pytest.raises(IndexError):
            ser.iloc[3] = 5.
        with pytest.raises(IndexError):
            ser.iat[3] = 5.

        # ## frame ##
        base_frame = DataFrame(np.arange(6).reshape(3, 2),
                               columns=['A', 'B'],
                               dtype='int64')

        # Positional setters must raise instead of enlarging.
        frame = base_frame.copy()
        with pytest.raises(IndexError):
            frame.iloc[4, 2] = 5.
        with pytest.raises(IndexError):
            frame.iat[4, 2] = 5.

        # Overwriting a row that already exists.
        expected = DataFrame({'A': [0, 4, 4], 'B': [1, 5, 5]})
        frame = base_frame.copy()
        frame.iloc[1] = frame.iloc[2]
        tm.assert_frame_equal(frame, expected)

        expected = DataFrame({'A': [0, 4, 4], 'B': [1, 5, 5]})
        frame = base_frame.copy()
        frame.loc[1] = frame.loc[2]
        tm.assert_frame_equal(frame, expected)

        # Like 2578: appending a row keeps the dtype.
        expected = DataFrame({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]})
        frame = base_frame.copy()
        frame.loc[3] = frame.loc[2]
        tm.assert_frame_equal(frame, expected)

        # Single dtype frame: overwrite a whole column.
        expected = DataFrame({'A': [0, 2, 4], 'B': [0, 2, 4]})
        frame = base_frame.copy()
        with catch_warnings(record=True):
            frame.ix[:, 'B'] = frame.ix[:, 'A']
        tm.assert_frame_equal(frame, expected)

        # Mixed dtype frame: overwrite a whole column.
        expected = DataFrame({'A': [0, 2, 4], 'B': Series([0, 2, 4])})
        frame = base_frame.copy()
        frame['B'] = frame['B'].astype(np.float64)
        with catch_warnings(record=True):
            frame.ix[:, 'B'] = frame.ix[:, 'A']
        tm.assert_frame_equal(frame, expected)

        # Single dtype frame: add a column. ``expected`` is deliberately
        # built from the frame left over from the previous step.
        expected = base_frame.copy()
        expected['C'] = frame['A']
        frame = base_frame.copy()
        with catch_warnings(record=True):
            frame.ix[:, 'C'] = frame.ix[:, 'A']
        tm.assert_frame_equal(frame, expected)

        # Mixed frame: add a column (again seeded from the previous frame).
        expected = base_frame.copy()
        expected['C'] = frame['A']
        frame = base_frame.copy()
        with catch_warnings(record=True):
            frame.ix[:, 'C'] = frame.ix[:, 'A']
        tm.assert_frame_equal(frame, expected)

        with catch_warnings(record=True):
            # ## panel ##
            base_panel = Panel(np.arange(16).reshape(2, 4, 2),
                               items=['Item1', 'Item2'],
                               major_axis=pd.date_range('2001/1/12',
                                                        periods=4),
                               minor_axis=['A', 'B'],
                               dtype='float64')

            # Adding a new item through .loc.
            expected = base_panel.copy()
            expected['Item3'] = expected['Item1']
            pnl = base_panel.copy()
            pnl.loc['Item3'] = pnl['Item1']
            tm.assert_panel_equal(pnl, expected)

            # Adding a minor-axis label from a Series aligned on the items.
            expected = base_panel.copy().transpose(2, 1, 0)
            expected['C'] = DataFrame({'Item1': [30, 30, 30, 30],
                                       'Item2': [32, 32, 32, 32]},
                                      index=base_panel.major_axis)
            expected = expected.transpose(2, 1, 0)
            pnl = base_panel.copy()
            pnl.loc[:, :, 'C'] = Series([30, 32], index=base_panel.items)
            tm.assert_panel_equal(pnl, expected)

        # GH 8473
        dates = date_range('1/1/2000', periods=8)
        base_frame = DataFrame(np.random.randn(8, 4),
                               index=dates,
                               columns=['A', 'B', 'C', 'D'])
        # One step past the last existing index label.
        next_label = dates[-1] + 1

        # Enlarging with a new row in an existing column.
        appended_row = DataFrame({'A': 7}, index=[next_label])
        expected = pd.concat([base_frame, appended_row], sort=True)
        frame = base_frame.copy()
        frame.loc[next_label, 'A'] = 7
        tm.assert_frame_equal(frame, expected)
        frame = base_frame.copy()
        frame.at[next_label, 'A'] = 7
        tm.assert_frame_equal(frame, expected)

        # Enlarging with a new row and a new column at once.
        exp_other = DataFrame({0: 7}, index=[next_label])
        expected = pd.concat([base_frame, exp_other], axis=1)

        frame = base_frame.copy()
        frame.loc[next_label, 0] = 7
        tm.assert_frame_equal(frame, expected)
        frame = base_frame.copy()
        frame.at[next_label, 0] = 7
        tm.assert_frame_equal(frame, expected)
Example #36
0
    def test_sample(sel):
        """Exercise ``sample`` weight and axis handling on frames, series, panels.

        The checks cover: weights given as a column name, errors raised for
        invalid weight specifications, weights that do not sum to one, the
        ``axis`` argument in several spellings, the default sampling axis,
        and alignment of Series weights with the frame's index.
        """
        # NOTE(review): the first parameter is spelled ``sel`` rather than
        # ``self``; it is never referenced in the body.
        # Fixes issue: 2419
        # additional specific object based tests

        # A few DataFrame tests with degenerate weights: all of the weight
        # sits on position 5, so the expected sample is exactly that row.
        easy_weight_list = [0] * 10
        easy_weight_list[5] = 1

        df = pd.DataFrame({
            'col1': range(10, 20),
            'col2': range(20, 30),
            'colString': ['a'] * 10,
            'easyweights': easy_weight_list
        })
        sample1 = df.sample(n=1, weights='easyweights')
        assert_frame_equal(sample1, df.iloc[5:6])

        # Ensure proper error if string given as weight for Series, panel, or
        # DataFrame with axis = 1.
        s = Series(range(10))
        with pytest.raises(ValueError):
            s.sample(n=3, weights='weight_column')

        with catch_warnings(record=True):
            panel = Panel(items=[0, 1, 2],
                          major_axis=[2, 3, 4],
                          minor_axis=[3, 4, 5])
            with pytest.raises(ValueError):
                panel.sample(n=1, weights='weight_column')

        with pytest.raises(ValueError):
            df.sample(n=1, weights='weight_column', axis=1)

        # Check weighting key error: the named column does not exist in df.
        with pytest.raises(KeyError):
            df.sample(n=3, weights='not_a_real_column_name')

        # Check that sample re-normalizes weights that don't sum to one:
        # the only non-zero weight (0.5) is on the first row.
        weights_less_than_1 = [0] * 10
        weights_less_than_1[0] = 0.5
        tm.assert_frame_equal(df.sample(n=1, weights=weights_less_than_1),
                              df.iloc[:1])

        ###
        # Test axis argument
        ###

        # Sampling along the columns with all weight on the second column.
        df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10})
        second_column_weight = [0, 1]
        assert_frame_equal(
            df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']])

        # Different axis arg types
        assert_frame_equal(
            df.sample(n=1, axis='columns', weights=second_column_weight),
            df[['col2']])

        # Row sampling via the 'rows' and 'index' spellings; the single
        # non-zero weight is on position 5.
        weight = [0] * 10
        weight[5] = 0.5
        assert_frame_equal(df.sample(n=1, axis='rows', weights=weight),
                           df.iloc[5:6])
        assert_frame_equal(df.sample(n=1, axis='index', weights=weight),
                           df.iloc[5:6])

        # Check out of range axis values
        with pytest.raises(ValueError):
            df.sample(n=1, axis=2)

        with pytest.raises(ValueError):
            df.sample(n=1, axis='not_a_name')

        with pytest.raises(ValueError):
            s = pd.Series(range(10))
            s.sample(n=1, axis=1)

        # Test weight length compared to correct axis: ten weights are
        # supplied while sampling along axis=1 of a two-column frame.
        with pytest.raises(ValueError):
            df.sample(n=1, axis=1, weights=[0.5] * 10)

        # Check weights with axis = 1
        easy_weight_list = [0] * 3
        easy_weight_list[2] = 1

        df = pd.DataFrame({
            'col1': range(10, 20),
            'col2': range(20, 30),
            'colString': ['a'] * 10
        })
        sample1 = df.sample(n=1, axis=1, weights=easy_weight_list)
        assert_frame_equal(sample1, df[['colString']])

        # Test default axes: with the same random_state, omitting ``axis``
        # must match axis=1 for the panel and axis=0 for the frame.
        with catch_warnings(record=True):
            p = Panel(items=['a', 'b', 'c'],
                      major_axis=[2, 4, 6],
                      minor_axis=[1, 3, 5])
            assert_panel_equal(p.sample(n=3, random_state=42),
                               p.sample(n=3, axis=1, random_state=42))
            assert_frame_equal(df.sample(n=3, random_state=42),
                               df.sample(n=3, axis=0, random_state=42))

        # Test that function aligns weights with frame: the weight Series is
        # indexed in a different order than the frame.
        df = DataFrame({
            'col1': [5, 6, 7],
            'col2': ['a', 'b', 'c'],
        },
                       index=[9, 5, 3])
        s = Series([1, 0, 0], index=[3, 5, 9])
        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s))

        # Weights have index values to be dropped because not in
        # sampled DataFrame
        s2 = Series([0.001, 0, 10000], index=[3, 5, 10])
        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2))

        # Weights have empty values to be filled with zeros
        s3 = Series([0.01, 0], index=[3, 5])
        assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3))

        # No overlap in weight and sampled DataFrame indices
        s4 = Series([1, 0], index=[1, 2])
        with pytest.raises(ValueError):
            df.sample(1, weights=s4)
Example #37
0
    def setUp(self):
        """Build Series/DataFrame/Panel fixtures for several index types.

        Each fixture is stored as ``self.<object>_<type>`` (for example
        ``self.frame_ints``).  At the end, the fixtures are regrouped into
        one dict per name in ``self._objs``, keyed by the names in
        ``self._typs``.
        """

        # Every Panel below is constructed inside catch_warnings(record=True)
        # -- presumably to capture warnings emitted by Panel; TODO confirm.

        # Integer labels.
        self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2))
        self.frame_ints = DataFrame(np.random.randn(4, 4),
                                    index=lrange(0, 8, 2),
                                    columns=lrange(0, 12, 3))
        with catch_warnings(record=True):
            self.panel_ints = Panel(np.random.rand(4, 4, 4),
                                    items=lrange(0, 8, 2),
                                    major_axis=lrange(0, 12, 3),
                                    minor_axis=lrange(0, 16, 4))

        # The same labels wrapped in UInt64Index.
        self.series_uints = Series(np.random.rand(4),
                                   index=UInt64Index(lrange(0, 8, 2)))
        self.frame_uints = DataFrame(np.random.randn(4, 4),
                                     index=UInt64Index(lrange(0, 8, 2)),
                                     columns=UInt64Index(lrange(0, 12, 3)))
        with catch_warnings(record=True):
            self.panel_uints = Panel(np.random.rand(4, 4, 4),
                                     items=UInt64Index(lrange(0, 8, 2)),
                                     major_axis=UInt64Index(lrange(0, 12, 3)),
                                     minor_axis=UInt64Index(lrange(0, 16, 4)))

        # Single-character string labels.
        self.series_labels = Series(np.random.randn(4), index=list('abcd'))
        self.frame_labels = DataFrame(np.random.randn(4, 4),
                                      index=list('abcd'),
                                      columns=list('ABCD'))
        with catch_warnings(record=True):
            self.panel_labels = Panel(np.random.randn(4, 4, 4),
                                      items=list('abcd'),
                                      major_axis=list('ABCD'),
                                      minor_axis=list('ZYXW'))

        # Mixed integer/string labels on the first axis only.
        self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8])
        self.frame_mixed = DataFrame(np.random.randn(4, 4),
                                     index=[2, 4, 'null', 8])
        with catch_warnings(record=True):
            self.panel_mixed = Panel(np.random.randn(4, 4, 4),
                                     items=[2, 4, 'null', 8])

        # Date labels on the first axis only.
        self.series_ts = Series(np.random.randn(4),
                                index=date_range('20130101', periods=4))
        self.frame_ts = DataFrame(np.random.randn(4, 4),
                                  index=date_range('20130101', periods=4))
        with catch_warnings(record=True):
            self.panel_ts = Panel(np.random.randn(4, 4, 4),
                                  items=date_range('20130101', periods=4))

        # The same dates sorted in descending order.
        dates_rev = (date_range('20130101',
                                periods=4).sort_values(ascending=False))
        self.series_ts_rev = Series(np.random.randn(4), index=dates_rev)
        self.frame_ts_rev = DataFrame(np.random.randn(4, 4), index=dates_rev)
        with catch_warnings(record=True):
            self.panel_ts_rev = Panel(np.random.randn(4, 4, 4),
                                      items=dates_rev)

        # Empty containers.
        self.frame_empty = DataFrame({})
        self.series_empty = Series({})
        with catch_warnings(record=True):
            self.panel_empty = Panel({})

        # form agglomerates: for each name ``o`` in self._objs, set self.<o>
        # to a dict mapping each name ``t`` in self._typs to the fixture
        # ``self.<o>_<t>``, or None when no such attribute exists.
        # NOTE(review): _objs and _typs are defined outside this method.
        for o in self._objs:

            d = dict()
            for t in self._typs:
                d[t] = getattr(self, '%s_%s' % (o, t), None)

            setattr(self, o, d)
Example #38
0
 def __init__(self, autoAdjust=True):
     """Store the adjust flag, the default start date and an empty Panel.

     autoAdjust is kept as given on ``self.autoAdjust``.
     """
     self.autoAdjust = autoAdjust
     # Start date kept as a (year, month, day) tuple.
     self.startDate = (2008, 1, 1)
     # Begin with an empty Panel.
     self.wp = Panel()
Example #39
0
 def time_from_dict(self):
     """Build a Panel from ``self.data_frames`` while recording warnings."""
     frames = self.data_frames
     # Warnings raised during construction are recorded, not shown.
     with warnings.catch_warnings(record=True):
         Panel.from_dict(frames)
Example #40
0
class HistData(object):
    """A class for working with Yahoo Finance data.

    Downloaded frames are kept in ``self.wp``, a ``Panel`` with one item per
    symbol.  The panel can be persisted to, and restored from, an HDF store.
    """

    def __init__(self, autoAdjust=True):
        """Create an empty container.

        autoAdjust : when true, downloaded frames are passed through
            ``_adjust(df, removeOrig=True)`` before being stored.
        """
        self.startDate = (2008, 1, 1)
        self.autoAdjust = autoAdjust
        self.wp = Panel()

    def load(self, dataFile):
        """Load data from HDF, replacing the current panel.

        Raises IOError when ``dataFile`` does not exist.
        """
        if not os.path.exists(dataFile):
            raise IOError('Data file does not exist')

        store = HDFStore(dataFile)
        try:
            # Store keys carry a leading '/', which is stripped to get the
            # symbol names.
            symbols = [str(s).strip('/') for s in store.keys()]
            data = {symbol: store[symbol] for symbol in symbols}
            self.wp = Panel(data)
        finally:
            # Close the store even if reading or building the panel fails.
            store.close()

    def save(self, dataFile):
        """Save data to HDF, one store entry per panel item."""
        print('Saving data to', dataFile)
        store = HDFStore(dataFile)
        try:
            for symbol in self.wp.items:
                store[symbol] = self.wp[symbol]
        finally:
            # Close the store even if a write fails part-way through.
            store.close()

    def downloadData(self, symbols='all'):
        """Get data from Yahoo for ``symbols``.

        symbols : iterable of symbol names, or the string 'all' to refresh
            every symbol already held in the panel.

        A failure for one symbol is printed and the loop continues with the
        next symbol.
        """
        # The isinstance guard avoids comparing an array-like of symbols
        # against a string, which has no single truth value.
        if isinstance(symbols, str) and symbols == 'all':
            symbols = self.symbols

        p = ProgressBar(len(symbols))

        for idx, symbol in enumerate(symbols):
            try:
                df = getSymbolData(symbol, sDate=self.startDate, verbose=False)
                if self.autoAdjust:
                    df = _adjust(df, removeOrig=True)

                if not self.symbols:
                    # First frame: build the panel from it.
                    self.wp = Panel({symbol: df})
                else:
                    self.wp[symbol] = df
            except Exception as e:
                # Best effort: report the failure and keep going.
                print(e)
            p.animate(idx + 1)

    def getDataFrame(self, field='close'):
        """Return a slice on the wide panel for a given field."""
        return self.wp.minor_xs(field)

    @property
    def symbols(self):
        """List of the panel's item labels."""
        return self.wp.items.tolist()

    def __repr__(self):
        return str(self.wp)
Example #41
0
    def setup_method(self, method):
        """Build Series/DataFrame/Panel fixtures for several index types.

        Each fixture is stored as ``self.<object>_<type>`` (for example
        ``self.frame_floats``).  At the end, the fixtures are regrouped into
        one dict per name in ``self._objs``, keyed by the names in
        ``self._typs``.  ``method`` is not used in this body.
        """

        # Every Panel below is constructed inside catch_warnings(record=True)
        # -- presumably to capture warnings emitted by Panel; TODO confirm.

        # Integer labels.
        self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2))
        self.frame_ints = DataFrame(np.random.randn(4, 4),
                                    index=lrange(0, 8, 2),
                                    columns=lrange(0, 12, 3))
        with catch_warnings(record=True):
            self.panel_ints = Panel(np.random.rand(4, 4, 4),
                                    items=lrange(0, 8, 2),
                                    major_axis=lrange(0, 12, 3),
                                    minor_axis=lrange(0, 16, 4))

        # The same labels wrapped in UInt64Index.
        self.series_uints = Series(np.random.rand(4),
                                   index=UInt64Index(lrange(0, 8, 2)))
        self.frame_uints = DataFrame(np.random.randn(4, 4),
                                     index=UInt64Index(lrange(0, 8, 2)),
                                     columns=UInt64Index(lrange(0, 12, 3)))
        with catch_warnings(record=True):
            self.panel_uints = Panel(np.random.rand(4, 4, 4),
                                     items=UInt64Index(lrange(0, 8, 2)),
                                     major_axis=UInt64Index(lrange(0, 12, 3)),
                                     minor_axis=UInt64Index(lrange(0, 16, 4)))

        # The same ranges wrapped in Float64Index.
        self.series_floats = Series(np.random.rand(4),
                                    index=Float64Index(range(0, 8, 2)))
        self.frame_floats = DataFrame(np.random.randn(4, 4),
                                      index=Float64Index(range(0, 8, 2)),
                                      columns=Float64Index(range(0, 12, 3)))
        with catch_warnings(record=True):
            self.panel_floats = Panel(np.random.rand(4, 4, 4),
                                      items=Float64Index(range(0, 8, 2)),
                                      major_axis=Float64Index(range(0, 12, 3)),
                                      minor_axis=Float64Index(range(0, 16, 4)))

        # Three MultiIndexes, each the product of two 2-element lists; one
        # is used per axis below.
        m_idces = [
            MultiIndex.from_product([[1, 2], [3, 4]]),
            MultiIndex.from_product([[5, 6], [7, 8]]),
            MultiIndex.from_product([[9, 10], [11, 12]])
        ]

        self.series_multi = Series(np.random.rand(4), index=m_idces[0])
        self.frame_multi = DataFrame(np.random.randn(4, 4),
                                     index=m_idces[0],
                                     columns=m_idces[1])
        with catch_warnings(record=True):
            self.panel_multi = Panel(np.random.rand(4, 4, 4),
                                     items=m_idces[0],
                                     major_axis=m_idces[1],
                                     minor_axis=m_idces[2])

        # Single-character string labels.
        self.series_labels = Series(np.random.randn(4), index=list('abcd'))
        self.frame_labels = DataFrame(np.random.randn(4, 4),
                                      index=list('abcd'),
                                      columns=list('ABCD'))
        with catch_warnings(record=True):
            self.panel_labels = Panel(np.random.randn(4, 4, 4),
                                      items=list('abcd'),
                                      major_axis=list('ABCD'),
                                      minor_axis=list('ZYXW'))

        # Mixed integer/string labels on the first axis only.
        self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8])
        self.frame_mixed = DataFrame(np.random.randn(4, 4),
                                     index=[2, 4, 'null', 8])
        with catch_warnings(record=True):
            self.panel_mixed = Panel(np.random.randn(4, 4, 4),
                                     items=[2, 4, 'null', 8])

        # Date labels on the first axis only.
        self.series_ts = Series(np.random.randn(4),
                                index=date_range('20130101', periods=4))
        self.frame_ts = DataFrame(np.random.randn(4, 4),
                                  index=date_range('20130101', periods=4))
        with catch_warnings(record=True):
            self.panel_ts = Panel(np.random.randn(4, 4, 4),
                                  items=date_range('20130101', periods=4))

        # The same dates sorted in descending order.
        dates_rev = (date_range('20130101',
                                periods=4).sort_values(ascending=False))
        self.series_ts_rev = Series(np.random.randn(4), index=dates_rev)
        self.frame_ts_rev = DataFrame(np.random.randn(4, 4), index=dates_rev)
        with catch_warnings(record=True):
            self.panel_ts_rev = Panel(np.random.randn(4, 4, 4),
                                      items=dates_rev)

        # Empty containers.
        self.frame_empty = DataFrame({})
        self.series_empty = Series({})
        with catch_warnings(record=True):
            self.panel_empty = Panel({})

        # form agglomerates: for each name ``o`` in self._objs, set self.<o>
        # to a dict mapping each name ``t`` in self._typs to the fixture
        # ``self.<o>_<t>``, or None when no such attribute exists.
        # NOTE(review): _objs and _typs are defined outside this method.
        for o in self._objs:

            d = dict()
            for t in self._typs:
                d[t] = getattr(self, '%s_%s' % (o, t), None)

            setattr(self, o, d)
Example #42
0
 def p_apply(panel, f):
     """Apply ``f`` to every item of ``panel`` and wrap the results in a Panel.

     The returned Panel is built with the input panel's ``items``.
     """
     labels = panel.items
     transformed = {label: f(panel[label]) for label in labels}
     return Panel(transformed, items=labels)
Example #43
0
 def test_to_dense(self):
     """Check ``self.panel.to_dense()`` against a Panel built from ``self.data_dict``."""
     # NOTE(review): self.panel and self.data_dict are set up outside this
     # method -- presumably the same data in two forms; TODO confirm.
     dwp = self.panel.to_dense()
     dwp2 = Panel.from_dict(self.data_dict)
     tm.assert_panel_equal(dwp, dwp2)