Example #1
    def _parse(cls, body):
        matched = re.search(r'<div class="col_r" style="">(.*?)</div>', body, re.MULTILINE | re.DOTALL | re.UNICODE)
        if matched is None or len(matched.groups()) == 0:
            raise ValueError("no matched data found.")

        lines = matched.group(1).strip().split("\n")

        value_pattern = re.compile(r">(.*?)<", re.UNICODE)
        data_array = []
        stock_name = cls._get_stock_name(body)
        for line in lines:
            if r"<tr" not in line:
                continue

            data = []
            line = line.strip()
            for value in re.findall(value_pattern, line):
                value = cls._normalize(value)
                if isinstance(value, string_types) and len(value) == 0:
                    continue
                data.append(value)
            if len(data) > 0:
                data_array.append(data)

        if data_array:
            data_array.insert(0, [stock_name] * len(data_array[0]))
            data_array = np.array(data_array).T
        df = DataFrame(data_array, columns=NETEASE_STOCK_INFO_COLUMNS)
        df.set_index("date", inplace=True)
        return df
Example #2
def test_resample_timegrouper():
    # GH 7227
    dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3),
              datetime(2014, 11, 5), datetime(2014, 9, 5),
              datetime(2014, 10, 8), datetime(2014, 7, 15)]

    dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:]
    dates3 = [pd.NaT] + dates1 + [pd.NaT]

    for dates in [dates1, dates2, dates3]:
        df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
        result = df.set_index('A').resample('M').count()
        exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31',
                                    '2014-09-30',
                                    '2014-10-31', '2014-11-30'],
                                   freq='M', name='A')
        expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
        assert_frame_equal(result, expected)

        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
        assert_frame_equal(result, expected)

        df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(
            len(dates))))
        result = df.set_index('A').resample('M').count()
        expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]},
                             index=exp_idx, columns=['B', 'C'])
        assert_frame_equal(result, expected)

        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
        assert_frame_equal(result, expected)
Example #3
def proximity(features, pos_columns=['x', 'y']):
    """Find the distance to each feature's nearest neighbor.

    Parameters
    ----------
    features : DataFrame
    pos_columns : list of column names
        ['x', 'y'] by default

    Returns
    -------
    proximity : DataFrame
        distance to each particle's nearest neighbor,
        indexed by particle if 'particle' column is present in input

    Example
    -------
    Find the proximity of each particle to its nearest neighbor in every frame.

    >>> prox = t.groupby('frame').apply(proximity).reset_index()
    >>> avg_prox = prox.groupby('particle')['proximity'].mean()

    And filter the trajectories...

    >>> particle_nos = avg_prox[avg_prox > 20].index
    >>> t_filtered = t[t['particle'].isin(particle_nos)]
    """
    leaf_size = max(1, int(np.round(np.log10(len(features)))))
    tree = cKDTree(features[pos_columns].copy(), leaf_size)
    proximity = tree.query(tree.data, 2)[0][:, 1]
    result = DataFrame({'proximity': proximity})
    if 'particle' in features:
        result.set_index(features['particle'], inplace=True)
    return result
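
A minimal usage sketch for proximity() above, on hypothetical data (assumes the snippet's own imports, numpy as np, pandas DataFrame and scipy's cKDTree, are in scope):

features = DataFrame({'x': [0.0, 1.0, 5.0],
                      'y': [0.0, 0.0, 0.0],
                      'particle': [0, 1, 2]})
print(proximity(features))
# particles 0 and 1 are 1.0 apart; particle 2's nearest neighbor is 4.0 away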
Example #4
    def test_construction_with_categorical_index(self):

        ci = tm.makeCategoricalIndex(10)

        # with Categorical
        df = DataFrame({'A': np.random.randn(10),
                        'B': ci.values})
        idf = df.set_index('B')
        str(idf)
        tm.assert_index_equal(idf.index, ci, check_names=False)
        assert idf.index.name == 'B'

        # from a CategoricalIndex
        df = DataFrame({'A': np.random.randn(10),
                        'B': ci})
        idf = df.set_index('B')
        str(idf)
        tm.assert_index_equal(idf.index, ci, check_names=False)
        assert idf.index.name == 'B'

        idf = df.set_index('B').reset_index().set_index('B')
        str(idf)
        tm.assert_index_equal(idf.index, ci, check_names=False)
        assert idf.index.name == 'B'

        new_df = idf.reset_index()
        new_df.index = df.B
        tm.assert_index_equal(new_df.index, ci, check_names=False)
        assert new_df.index.name == 'B'
Example #5
def _create_df(sheet, start_row, start_col, end_row, end_col, reindex=False):
    df = DataFrame(sheet[start_row+1:end_row, start_col:end_col].value,
                   columns=sheet[start_row, start_col:end_col].value)

    if reindex:
        df.set_index(keys=df.iloc[:, 0], inplace=True)
    return df
Example #6
def stats(request):
    stats_by = request.GET.get('by', 'category')

    trx = Transaction.objects.filter(amount__lt=0).exclude(category__name='Credit Card Payments')
    original_df = DataFrame(data=[{k: getattr(t, k) for k in ('date', 'category', 'amount')} for t in trx])

    df = original_df.set_index('date').groupby('category').resample('M', how='sum')

    chart_df = df.reset_index()\
                 .pivot_table(values='amount', index=['date'], columns=['category'], aggfunc=numpy.sum)\
                 .replace(numpy.NaN, 0)

    months = [x.strftime('%Y-%m-%d') for x in chart_df.index]
    chart_series = [
        {'name': category, 'type': 'column', 'data': [abs(float(a)) for a in amounts]}
        for category, amounts in chart_df.iteritems()]

    # chart_df iterated column-wise already yields (category, amounts-by-month);
    # table_df, the transposed pivot, is computed here but never used below.
    table_df = df.reset_index()\
                 .pivot_table(values='amount', index=['category'], columns=['date'], aggfunc=numpy.sum)\
                 .replace(numpy.NaN, 0)
    table_data = [(category, list(amounts)) for category, amounts in chart_df.iteritems()]
    total_df = original_df.set_index('date').resample('M', how='sum').transpose()
    table_data.append(('Total', total_df.values[0]))

    return render_to_response('transactions/stats.html', {
        'months_json': json.dumps(months),
        'chart_series_json': json.dumps(chart_series),
        'chart_df': chart_df,
        'months': months,
        'table_data': table_data,
    })
Example #7
    def test_dti_set_index_reindex(self):
        # GH 6631
        df = DataFrame(np.random.random(6))
        idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern')
        idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo')

        df = df.set_index(idx1)
        tm.assert_index_equal(df.index, idx1)
        df = df.reindex(idx2)
        tm.assert_index_equal(df.index, idx2)

        # 11314
        # with tz
        index = date_range(datetime(2015, 10, 1),
                           datetime(2015, 10, 1, 23),
                           freq='H', tz='US/Eastern')
        df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index)
        new_index = date_range(datetime(2015, 10, 2),
                               datetime(2015, 10, 2, 23),
                               freq='H', tz='US/Eastern')

        # TODO: unused?
        result = df.set_index(new_index)  # noqa

        assert new_index.freq == index.freq
Example #8
    def test_reset_index_multiindex_nan(self):
        # GH6322, testing reset_index on MultiIndexes
        # when we have a nan or all nan
        df = DataFrame({'A': ['a', 'b', 'c'],
                        'B': [0, 1, np.nan],
                        'C': np.random.rand(3)})
        rs = df.set_index(['A', 'B']).reset_index()
        tm.assert_frame_equal(rs, df)

        df = DataFrame({'A': [np.nan, 'b', 'c'],
                        'B': [0, 1, 2],
                        'C': np.random.rand(3)})
        rs = df.set_index(['A', 'B']).reset_index()
        tm.assert_frame_equal(rs, df)

        df = DataFrame({'A': ['a', 'b', 'c'],
                        'B': [0, 1, 2],
                        'C': [np.nan, 1.1, 2.2]})
        rs = df.set_index(['A', 'B']).reset_index()
        tm.assert_frame_equal(rs, df)

        df = DataFrame({'A': ['a', 'b', 'c'],
                        'B': [np.nan, np.nan, np.nan],
                        'C': np.random.rand(3)})
        rs = df.set_index(['A', 'B']).reset_index()
        tm.assert_frame_equal(rs, df)
Example #9
    def load_frame(cls, session):
        """
        Load part of the table into a well-formatted pandas.DataFrame.

        session can be any object with the execute method.
        """
        sample = cls.__table__
        job = Job.__table__
        result = Result.__table__
        analysis = AnalysisConfiguration.__table__
        control = ControlConfiguration.__table__
        experiment = Experiment.__table__
        stmt = select([sample.c.id, sample.c.control,
                       result.c.point, control.c.type, control.c.direction,
                       experiment.c.strain, job.c.preparation, job.c.sampling,
                       job.c.projection, job.c.measure, job.c.delay,
                       analysis.c.version]).where(and_(
                           sample.c.result_id == result.c.id,
                           result.c.job_id == job.c.id,
                           job.c.analysis_id == analysis.c.id,
                           job.c.control_id == control.c.id,
                           job.c.experiment_id == experiment.c.id))
        query = session.execute(stmt)
        df = DataFrame(iter(query), columns=query.keys())
        df.set_index("id", inplace=True)
        return df
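
The DataFrame construction in load_frame() works for any iterable of row tuples, not just a SQLAlchemy result; a stand-alone sketch of the same pattern on hypothetical data:

from pandas import DataFrame

rows = [(1, 0.25, 'wt'), (2, 0.75, 'mut')]
df = DataFrame(iter(rows), columns=['id', 'point', 'strain'])
df.set_index('id', inplace=True)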
Example #10
    def calculate_top_10_solutions(self):
        '''Calculate all schemes and select the top 10 solutions.'''

        columns = ['name', 'rate', 'money']

        if isfile(learning_progres_csv):
            scheme_profit = read_csv(learning_progres_csv)
        else:
            scheme_profit = DataFrame(columns=columns)
        scheme_profit.set_index('name', inplace=True)

        with open(learning_progres_csv, 'w+') as csvfile:
            writer = csv.DictWriter(csvfile, delimiter=',', fieldnames=columns)
            writer.writeheader()
            csvfile.flush()
            for sc in self.generate_all_schemes():
                if sc.name not in scheme_profit.index:
                    e = evaluator(sc)
                    rate, money = e.calculate()
                    scheme_profit.ix[sc.name] = (rate, money)
                    writer.writerow({'name': sc.name, 'rate': rate, 'money': money})
                    csvfile.flush()
                    if self.log:
                        print(sc.name + ' - ' + str(money) + ' \t rate = ' + str(rate))
                else:
                    writer.writerow({'name': sc.name,
                                     'rate': scheme_profit.rate[sc.name],
                                     'money': scheme_profit.money[sc.name]})
                    if self.log:
                        print(sc.name + ' - ' + str(scheme_profit.money[sc.name]) +
                              ' \t rate = ' + str(scheme_profit.rate[sc.name]))
                    csvfile.flush()

        # TODO: write into scheme
        # DataFrame.sort() returns a new frame, so assign the result back
        scheme_profit = scheme_profit.sort(['money'], ascending=False)
        return scheme_profit[:10].to_dict()
Example #11
    def get_data(stock, start=None, end=None, interval='d'):
        params = dict(s=stock)
        format = "%Y-%m-%d"
        if start is not None:
            date = datetime.datetime.strptime(start, format)
            params['a'] = date.month - 1
            params['b'] = date.day
            params['c'] = date.year

        if end is not None:
            date = datetime.datetime.strptime(end, format)
            params['d'] = date.month - 1
            params['e'] = date.day
            params['f'] = date.year


        params['g'] = interval

        response = requests.get(YahooAPI.base_url, params=params)
        content = response.content.split('\n')
        headers = content[0].split(',')
        lines = [line.split(',') for line in content[1:-1]]  # last line empty
        df = DataFrame(lines, columns=headers)
        df['Date'] = pd.to_datetime(df['Date'], format=format)
        df.set_index('Date', inplace=True)
        return df
Example #12
    def test_index_with_nan(self):
        #  GH 2850
        df = DataFrame(
            {
                "id1": {0: "1a3", 1: "9h4"},
                "id2": {0: np.nan, 1: "d67"},
                "id3": {0: "78d", 1: "79d"},
                "value": {0: 123, 1: 64},
            }
        )

        # multi-index
        y = df.set_index(["id1", "id2", "id3"])
        result = y.to_string()
        expected = u"             value\nid1 id2 id3       \n1a3 NaN 78d    123\n9h4 d67 79d     64"
        self.assert_(result == expected)

        # index
        y = df.set_index("id2")
        result = y.to_string()
        expected = u"     id1  id3  value\nid2                 \nNaN  1a3  78d    123\nd67  9h4  79d     64"
        self.assert_(result == expected)

        # all-nan in mi
        df2 = df.copy()
        df2.ix[:, "id2"] = np.nan
        y = df2.set_index("id2")
        result = y.to_string()
        expected = u"     id1  id3  value\nid2                 \nNaN  1a3  78d    123\nNaN  9h4  79d     64"
        self.assert_(result == expected)
Example #13
    def test_to_csv_decimal(self):
        # GH 781
        df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})

        expected_default = ',col1,col2,col3\n0,1,a,10.1\n'
        assert df.to_csv() == expected_default

        expected_european_excel = ';col1;col2;col3\n0;1;a;10,1\n'
        assert df.to_csv(decimal=',', sep=';') == expected_european_excel

        expected_float_format_default = ',col1,col2,col3\n0,1,a,10.10\n'
        assert df.to_csv(float_format='%.2f') == expected_float_format_default

        expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n'
        assert df.to_csv(decimal=',', sep=';',
                         float_format='%.2f') == expected_float_format

        # GH 11553: testing if decimal is taken into account for '0.0'
        df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
        expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n'
        assert df.to_csv(index=False, decimal='^') == expected

        # same but for an index
        assert df.set_index('a').to_csv(decimal='^') == expected

        # same for a multi-index
        assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected
Example #14
    def parallel_cumulative_blame(self, branch='master', limit=None, skip=None, num_datapoints=None, committer=True,
                                  workers=1, ignore_globs=None, include_globs=None):
        """
        Returns the blame at every revision of interest. The index is a datetime, with one column per committer and the
        number of lines blamed to each committer at each timestamp as data.

        :param branch: (optional, default 'master') the branch to work in
        :param limit: (optional, default None), the maximum number of revisions to return, None for no limit
        :param skip: (optional, default None), the number of revisions to skip. Ex: skip=2 returns every other revision, None for no skipping.
        :param num_datapoints: (optional, default=None) if limit and skip are none, and this isn't, then num_datapoints evenly spaced revs will be used
        :param committer: (optional, default=True) true if committer should be reported, false if author
        :param ignore_globs: (optional, default=None) a list of globs to ignore, default none excludes nothing
        :param include_globs: (optional, default=None) a list of globs to include, default of None includes everything.
        :param workers: (optional, default=1) integer, the number of workers to use in the threadpool, -1 for one per core.
        :return: DataFrame

        """

        if not _has_joblib:
            raise ImportError('''Must have joblib installed to use parallel_cumulative_blame(), please use
            cumulative_blame() instead.''')

        revs = self.revs(branch=branch, limit=limit, skip=skip, num_datapoints=num_datapoints)

        if self.verbose:
            print('Beginning processing for cumulative blame:')

        revisions = json.loads(revs.to_json(orient='index'))
        revisions = [revisions[key] for key in revisions]

        ds = Parallel(n_jobs=workers, backend='threading', verbose=5)(
            delayed(_parallel_cumulative_blame_func)
            (self, x, committer, ignore_globs, include_globs) for x in revisions
        )

        revs = DataFrame(ds)
        del revs['rev']

        revs['date'] = to_datetime(revs['date'].map(datetime.datetime.fromtimestamp))
        revs.set_index(keys=['date'], drop=True, inplace=True)
        revs = revs.fillna(0.0)

        # drop 0 cols
        for col in revs.columns.values:
            if col != 'col':
                if revs[col].sum() == 0:
                    del revs[col]

        # drop 0 rows
        keep_idx = []
        committers = [x for x in revs.columns.values if x != 'date']
        for idx, row in revs.iterrows():
            if sum([row[x] for x in committers]) > 0:
                keep_idx.append(idx)

        revs = revs.ix[keep_idx]
        revs.sort_index(ascending=False, inplace=True)

        return revs
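
The fan-out above is plain joblib; a minimal sketch of that pattern in isolation (assumes only that joblib is installed):

from joblib import Parallel, delayed

def square(x):
    return x * x

results = Parallel(n_jobs=2, backend='threading')(
    delayed(square)(i) for i in range(5)
)
# results == [0, 1, 4, 9, 16]; Parallel preserves input order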
Example #15
def build_dataframe(days=10, fill_value=1., values={}, end_date=dt.date.today(), date_index=True):
    ''' Constructs and returns a DataFrame in the form of those that
    are returned by Pandas DataReader. It doesn't take weekends or
    holidays into account, so weekend dates will generate values
    as well.
    
    Options are as follows:

    days: the number of rows to return. Defaults to 10
    fill_value: the value to fill each cell with (excluding date),
        defaults to 1
    values: A dictionary containing values with which to populate
        columns of the new dataframe.
        For example: values={'Adj Close': [5,6,7,8,9,10]}
        When one or more columns are specified, the number of rows in
        the new dataframe will be the length of the shortest column.
    end_date: The end of the range of dates comprising the
        dataframe. Takes a datetime.date. The start date is derived
        from a combination of this and the days parameter. Defaults to
        today's date.
    date_index: A boolean flag of whether the returned dataframe should
        set the date as the index (instead of the default numerical 
        index). If True, the dataframe will perfectly mimic that which
        is returned by Pandas DataReader. Default is True.

    In addition, you may specify a non OHLC column, such as RSI, and
    it will be added to the typical OHLC dataframe that gets created.
    '''
    columns = ['Open','High','Low','Close','Adj Close','Volume']


    # determine the minimum number of rows in values
    if len(values) > 0:
        # create a helper list of key/len(value) tuples
        helper = [(key, len(value)) for key, value in values.items()]
        helper.sort(key=lambda x: x[1])
        days = helper[0][1]
    else:
        # Mutable default arguments are evaluated once, at function
        # definition time, so the default dict persists across calls;
        # rebind to a fresh dict here to avoid values leaking between
        # calls.
        values = {}
    for i in columns:
        if i in values:
            values[i] = values[i][:days] 
        else:
            values[i] = [fill_value] * days

    dateList = [end_date - dt.timedelta(days=i) for i in range(days)]
    # necessary so the dataframe flows from oldest to most recent when
    # read from top to bottom, like DataReader
    dateList.reverse()  
    values['Date'] = DatetimeIndex(dateList)
    df = DataFrame(values, index=range(days))
    if date_index:
        df.set_index(keys='Date', drop=True, inplace=True)
    return df
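
Hypothetical usage of build_dataframe() above: supplying one column trims the frame to the shortest supplied column and fills the rest with fill_value:

df = build_dataframe(values={'Adj Close': [5, 6, 7]})
# 3 rows indexed by date; 'Adj Close' is [5, 6, 7], the other columns are 1.0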
Example #16
    def test_set_index_cast_datetimeindex(self):
        df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                              for i in range(1000)],
                        'B': np.random.randn(1000)})

        idf = df.set_index('A')
        assert isinstance(idf.index, pd.DatetimeIndex)

        # don't cast a DatetimeIndex WITH a tz, leave as object
        # GH 6032
        i = (pd.DatetimeIndex(
            to_datetime(['2013-1-1 13:00',
                         '2013-1-2 14:00'], errors="raise"))
             .tz_localize('US/Pacific'))
        df = DataFrame(np.random.randn(2, 1), columns=['A'])

        expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                                 tz='US/Pacific'),
                                    pd.Timestamp('2013-01-02 14:00:00-0800',
                                                 tz='US/Pacific')],
                                   dtype="object"))

        # convert index to series
        result = Series(i)
        assert_series_equal(result, expected)

        # assign to frame
        df['B'] = i
        result = df['B']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'B'

        # keep the timezone
        result = i.to_series(keep_tz=True)
        assert_series_equal(result.reset_index(drop=True), expected)

        # convert to utc
        df['C'] = i.to_series().reset_index(drop=True)
        result = df['C']
        comp = pd.DatetimeIndex(expected.values)
        comp = comp.tz_localize(None)
        tm.assert_numpy_array_equal(result.values, comp.values)

        # list of datetimes with a tz
        df['D'] = i.to_pydatetime()
        result = df['D']
        assert_series_equal(result, expected, check_names=False)
        assert result.name == 'D'

        # GH 6785
        # set the index manually
        import pytz
        df = DataFrame(
            [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
        expected = df.set_index('ts')
        df.index = df['ts']
        df.pop('ts')
        assert_frame_equal(df, expected)
Example #17
    def ledger(self, from_date=None, to_date=None, freq=None):
        """
        Show the cash ledger
        """
        df = DataFrame(self._cash)[self._columns]
        df.set_index("TS", inplace=True)
        df.sort_index(inplace=True)
        df['balance'] = df['A'].cumsum()
        return df.reset_index()  # Hack to make decorator work
Example #18
    def test_join_segfault(self):
        # GH 1532
        df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]})
        df2 = DataFrame({'a': [2, 2], 'b': [1, 2], 'y': [1, 2]})
        df1 = df1.set_index(['a', 'b'])
        df2 = df2.set_index(['a', 'b'])
        # it works!
        for how in ['left', 'right', 'outer']:
            df1.join(df2, how=how)
Example #19
    def test_set_index_timezone(self):
        # GH 12358
        # tz-aware Series should retain the tz
        i = pd.to_datetime(["2014-01-01 10:10:10"],
                           utc=True).tz_convert('Europe/Rome')
        df = DataFrame({'i': i})
        assert df.set_index(i).index[0].hour == 11
        assert pd.DatetimeIndex(pd.Series(df.i))[0].hour == 11
        assert df.set_index(df.i).index[0].hour == 11
Example #20
def setIndexDataFrame():
    df = DataFrame({'a': range(7), 'b': range(7, 0, -1),
                    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                    'd': [0, 1, 2, 0, 1, 2, 3]})
    print(df)
    df2 = df.set_index(['c', 'd'])
    print(df2)
    df3 = df.set_index(['c', 'd'], drop=False)
    print(df3)
Example #21
    def test_set_index_nonuniq(self):
        df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
                        'B': ['one', 'two', 'three', 'one', 'two'],
                        'C': ['a', 'b', 'c', 'd', 'e'],
                        'D': np.random.randn(5),
                        'E': np.random.randn(5)})
        with assertRaisesRegexp(ValueError, 'Index has duplicate keys'):
            df.set_index('A', verify_integrity=True, inplace=True)
        self.assertIn('A', df)
Example #22
    def test_date_index_query_with_NaT_duplicates(self):
        engine, parser = self.engine, self.parser
        n = 10
        df = DataFrame(np.random.randn(n, 3))
        df['dates1'] = date_range('1/1/2012', periods=n)
        df['dates3'] = date_range('1/1/2014', periods=n)
        df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
        df.set_index('dates1', inplace=True, drop=True)
        with pytest.raises(NotImplementedError):
            df.query('index < 20130101 < dates3', engine=engine, parser=parser)
Example #23
    def test_period_set_index_reindex(self):
        # GH 6631
        df = DataFrame(np.random.random(6))
        idx1 = period_range('2011/01/01', periods=6, freq='M')
        idx2 = period_range('2013', periods=6, freq='A')

        df = df.set_index(idx1)
        tm.assert_index_equal(df.index, idx1)
        df = df.set_index(idx2)
        tm.assert_index_equal(df.index, idx2)
Example #24
    def test_append_preserve_index_name(self):
        # #980
        df1 = DataFrame(data=None, columns=['A', 'B', 'C'])
        df1 = df1.set_index(['A'])
        df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]],
                        columns=['A', 'B', 'C'])
        df2 = df2.set_index(['A'])

        result = df1.append(df2)
        self.assertEqual(result.index.name, 'A')
Example #25
    def test_pandas_extend_index(self):
        d1 = DataFrame(data=[2, 4, 6, 8], columns=["A"], index=[1, 2, 3, 4])
        d1.index.name = "first"

        d1["second"] = "default"
        d1.set_index(["second"], append=True, inplace=True)
        self.assertEqual(d1.index.names, ["first", "second"])

        d1 = d1.reorder_levels(["second", "first"])
        self.assertEqual(d1.index.names, ["second", "first"])
Example #26
def aggregate_chunks(mod_features_df, modality):
    without_info_df = mod_features_df.query('field != "info"')
    cnt_df = DataFrame([list(mod_features_df.loc[('info', 'count'), :].values)] * len(without_info_df),
                       index=without_info_df.index)
    agg_df = without_info_df * cnt_df
    agg_df = DataFrame(agg_df.sum(axis=1) / cnt_df.sum(axis=1), index=without_info_df.index)
    agg_df['modality'] = modality
    agg_df.set_index('modality', append=True, inplace=True)
    agg_df = agg_df.reorder_levels(['modality', 'field', 'feature'])
    return agg_df
Example #27
    def test_date_index_query(self):
        engine, parser = self.engine, self.parser
        n = 10
        df = DataFrame(np.random.randn(n, 3))
        df['dates1'] = date_range('1/1/2012', periods=n)
        df['dates3'] = date_range('1/1/2014', periods=n)
        df.set_index('dates1', inplace=True, drop=True)
        res = df.query('(index < 20130101) & (20130101 < dates3)',
                       engine=engine, parser=parser)
        expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
        assert_frame_equal(res, expec)
Example #28
def build_state_data(where_inner="", where_outer=""):
    """
    Generates a bar graph of complaint counts by state
    """
    query = COMPLAINTS_BY_STATE.format(where_inner, where_outer)
    cur.execute(query)
    cc_by_state = DataFrame(cur.fetchall(),
                            columns=['state', 'complaint_count'])
    # set_index returns a new frame unless inplace=True, so keep the result
    cc_by_state = cc_by_state.set_index('state', drop=False)

    return cc_by_state
Example #29
def _pricing_iter(csvdir, symbols, metadata, divs_splits, show_progress):
    with maybe_show_progress(symbols, show_progress,
                             label='Loading custom pricing data: ') as it:
        files = os.listdir(csvdir)
        for sid, symbol in enumerate(it):
            logger.debug('%s: sid %s' % (symbol, sid))

            try:
                fname = [fname for fname in files
                         if '%s.csv' % symbol in fname][0]
            except IndexError:
                raise ValueError("%s.csv file is not in %s" % (symbol, csvdir))

            dfr = read_csv(os.path.join(csvdir, fname),
                           parse_dates=[0],
                           infer_datetime_format=True,
                           index_col=0).sort_index()

            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if 'split' in dfr.columns:
                tmp = 1. / dfr[dfr['split'] != 1.0]['split']
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=['effective_date'])
                split['ratio'] = tmp.tolist()
                split['sid'] = sid

                splits = divs_splits['splits']
                index = Index(range(splits.shape[0],
                                    splits.shape[0] + split.shape[0]))
                split.set_index(index, inplace=True)
                divs_splits['splits'] = splits.append(split)

            if 'dividend' in dfr.columns:
                # ex_date   amount  sid record_date declared_date pay_date
                tmp = dfr[dfr['dividend'] != 0.0]['dividend']
                div = DataFrame(data=tmp.index.tolist(), columns=['ex_date'])
                div['record_date'] = NaT
                div['declared_date'] = NaT
                div['pay_date'] = NaT
                div['amount'] = tmp.tolist()
                div['sid'] = sid

                divs = divs_splits['divs']
                ind = Index(range(divs.shape[0], divs.shape[0] + div.shape[0]))
                div.set_index(ind, inplace=True)
                divs_splits['divs'] = divs.append(div)

            yield sid, dfr
Example #30
    def test_sort_multi_index(self):
        # GH 25775, testing that sorting by index works with a multi-index.
        df = DataFrame({'a': [3, 1, 2], 'b': [0, 0, 0],
                        'c': [0, 1, 2], 'd': list('abc')})
        result = df.set_index(list('abc')).sort_index(level=list('ba'))

        expected = DataFrame({'a': [1, 2, 3], 'b': [0, 0, 0],
                              'c': [1, 2, 0], 'd': list('bca')})
        expected = expected.set_index(list('abc'))

        tm.assert_frame_equal(result, expected)