import numpy as np
from pandas import DataFrame
from scipy.spatial import cKDTree


def proximity(features, pos_columns=['x', 'y']):
    """Find the distance to each feature's nearest neighbor.

    Parameters
    ----------
    features : DataFrame
    pos_columns : list of column names
        ['x', 'y'] by default

    Returns
    -------
    proximity : DataFrame
        distance to each particle's nearest neighbor,
        indexed by particle if 'particle' column is present in input

    Example
    -------
    Find the proximity of each particle to its nearest neighbor
    in every frame.

    >>> prox = t.groupby('frame').apply(proximity).reset_index()
    >>> avg_prox = prox.groupby('particle')['proximity'].mean()

    And filter the trajectories...

    >>> particle_nos = avg_prox[avg_prox > 20].index
    >>> t_filtered = t[t['particle'].isin(particle_nos)]
    """
    leaf_size = max(1, int(np.round(np.log10(len(features)))))
    # use pos_columns rather than hardcoded ['x', 'y'], so the parameter
    # actually takes effect
    tree = cKDTree(features[pos_columns].copy(), leaf_size)
    proximity = tree.query(tree.data, 2)[0][:, 1]
    result = DataFrame({'proximity': proximity})
    if 'particle' in features:
        result.set_index(features['particle'], inplace=True)
    return result
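# Hedged usage sketch for proximity(): the trajectories frame `t` below is
# made up, but follows the columns the docstring assumes ('x', 'y', 'frame',
# and 'particle').
import pandas as pd

t = pd.DataFrame({'x': [0.0, 1.0, 5.0, 6.0],
                  'y': [0.0, 1.0, 5.0, 6.0],
                  'frame': [0, 0, 0, 0],
                  'particle': [1, 2, 3, 4]})
prox = t.groupby('frame').apply(proximity)  # per-frame nearest-neighbor distances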
def test_reset_index_multiindex_nan(self):
    # GH6322, testing reset_index on MultiIndexes
    # when we have a nan or all nan
    df = DataFrame({'A': ['a', 'b', 'c'],
                    'B': [0, 1, np.nan],
                    'C': np.random.rand(3)})
    rs = df.set_index(['A', 'B']).reset_index()
    tm.assert_frame_equal(rs, df)

    df = DataFrame({'A': [np.nan, 'b', 'c'],
                    'B': [0, 1, 2],
                    'C': np.random.rand(3)})
    rs = df.set_index(['A', 'B']).reset_index()
    tm.assert_frame_equal(rs, df)

    df = DataFrame({'A': ['a', 'b', 'c'],
                    'B': [0, 1, 2],
                    'C': [np.nan, 1.1, 2.2]})
    rs = df.set_index(['A', 'B']).reset_index()
    tm.assert_frame_equal(rs, df)

    df = DataFrame({'A': ['a', 'b', 'c'],
                    'B': [np.nan, np.nan, np.nan],
                    'C': np.random.rand(3)})
    rs = df.set_index(['A', 'B']).reset_index()
    tm.assert_frame_equal(rs, df)
def _create_df(sheet, start_row, start_col, end_row, end_col, reindex=False):
    df = DataFrame(sheet[start_row + 1:end_row, start_col:end_col].value,
                   columns=sheet[start_row, start_col:end_col].value)
    if reindex:
        df.set_index(keys=df.iloc[:, 0], inplace=True)
    return df
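# A minimal sketch of how _create_df might be called, assuming an
# xlwings-style sheet object that supports 2-D slicing with a .value
# property; the workbook name is hypothetical.
import xlwings as xw

sheet = xw.Book('report.xlsx').sheets[0]
df = _create_df(sheet, start_row=0, start_col=0, end_row=10, end_col=4,
                reindex=True)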
def get_data(stock, start=None, end=None, interval='d'):
    params = dict(s=stock)
    date_format = "%Y-%m-%d"
    if start is not None:
        date = datetime.datetime.strptime(start, date_format)
        params['a'] = date.month - 1
        params['b'] = date.day
        params['c'] = date.year
    if end is not None:
        date = datetime.datetime.strptime(end, date_format)
        params['d'] = date.month - 1
        params['e'] = date.day
        params['f'] = date.year
    params['g'] = interval
    response = requests.get(YahooAPI.base_url, params=params)
    content = response.content.split('\n')
    headers = content[0].split(',')
    lines = [line.split(',') for line in content[1:-1]]  # last line empty
    df = DataFrame(lines, columns=headers)
    df['Date'] = pd.to_datetime(df['Date'], format=date_format)
    df.set_index('Date', inplace=True)
    return df
def test_to_csv_decimal(self):
    # GH 781
    df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})

    expected_default = ',col1,col2,col3\n0,1,a,10.1\n'
    assert df.to_csv() == expected_default

    expected_european_excel = ';col1;col2;col3\n0;1;a;10,1\n'
    assert df.to_csv(decimal=',', sep=';') == expected_european_excel

    expected_float_format_default = ',col1,col2,col3\n0,1,a,10.10\n'
    assert df.to_csv(float_format='%.2f') == expected_float_format_default

    expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n'
    assert df.to_csv(decimal=',', sep=';',
                     float_format='%.2f') == expected_float_format

    # GH 11553: testing if decimal is taken into account for '0.0'
    df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
    expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n'
    assert df.to_csv(index=False, decimal='^') == expected

    # same but for an index
    assert df.set_index('a').to_csv(decimal='^') == expected

    # same for a multi-index
    assert df.set_index(['a', 'b']).to_csv(decimal="^") == expected
def test_resample_timegrouper():
    # GH 7227
    dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3),
              datetime(2014, 11, 5), datetime(2014, 9, 5),
              datetime(2014, 10, 8), datetime(2014, 7, 15)]

    dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:]
    dates3 = [pd.NaT] + dates1 + [pd.NaT]

    for dates in [dates1, dates2, dates3]:
        df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
        result = df.set_index('A').resample('M').count()
        exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31', '2014-09-30',
                                    '2014-10-31', '2014-11-30'],
                                   freq='M', name='A')
        expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
        assert_frame_equal(result, expected)

        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
        assert_frame_equal(result, expected)

        df = DataFrame(dict(A=dates, B=np.arange(len(dates)),
                            C=np.arange(len(dates))))
        result = df.set_index('A').resample('M').count()
        expected = DataFrame({'B': [1, 0, 2, 2, 1],
                              'C': [1, 0, 2, 2, 1]},
                             index=exp_idx, columns=['B', 'C'])
        assert_frame_equal(result, expected)

        result = df.groupby(pd.Grouper(freq='M', key='A')).count()
        assert_frame_equal(result, expected)
def test_index_with_nan(self):
    # GH 2850
    df = DataFrame({"id1": {0: "1a3", 1: "9h4"},
                    "id2": {0: np.nan, 1: "d67"},
                    "id3": {0: "78d", 1: "79d"},
                    "value": {0: 123, 1: 64}})

    # multi-index
    y = df.set_index(["id1", "id2", "id3"])
    result = y.to_string()
    expected = (u"             value\nid1 id2 id3       \n"
                u"1a3 NaN 78d    123\n9h4 d67 79d     64")
    self.assert_(result == expected)

    # index
    y = df.set_index("id2")
    result = y.to_string()
    expected = (u"     id1  id3  value\nid2                 \n"
                u"NaN  1a3  78d    123\nd67  9h4  79d     64")
    self.assert_(result == expected)

    # all-nan in mi
    df2 = df.copy()
    df2.ix[:, "id2"] = np.nan
    y = df2.set_index("id2")
    result = y.to_string()
    expected = (u"     id1  id3  value\nid2                 \n"
                u"NaN  1a3  78d    123\nNaN  9h4  79d     64")
    self.assert_(result == expected)
def load_frame(cls, session):
    """
    Load part of the table into a well-formatted pandas.DataFrame.

    session can be any object with the execute method.
    """
    sample = cls.__table__
    job = Job.__table__
    result = Result.__table__
    analysis = AnalysisConfiguration.__table__
    control = ControlConfiguration.__table__
    experiment = Experiment.__table__
    stmt = select([sample.c.id, sample.c.control, result.c.point,
                   control.c.type, control.c.direction,
                   experiment.c.strain, job.c.preparation, job.c.sampling,
                   job.c.projection, job.c.measure, job.c.delay,
                   analysis.c.version]).where(and_(
        sample.c.result_id == result.c.id,
        result.c.job_id == job.c.id,
        job.c.analysis_id == analysis.c.id,
        job.c.control_id == control.c.id,
        job.c.experiment_id == experiment.c.id))
    query = session.execute(stmt)
    df = DataFrame(iter(query), columns=query.keys())
    df.set_index("id", inplace=True)
    return df
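# Hedged usage sketch: any object with an execute method works, e.g. a
# SQLAlchemy session. The engine URL is hypothetical, and Sample is assumed
# to be the mapped class that exposes load_frame as a classmethod.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///results.db')
session = sessionmaker(bind=engine)()
frame = Sample.load_frame(session)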
def test_construction_with_categorical_index(self):
    ci = tm.makeCategoricalIndex(10)

    # with Categorical
    df = DataFrame({'A': np.random.randn(10), 'B': ci.values})
    idf = df.set_index('B')
    str(idf)
    tm.assert_index_equal(idf.index, ci, check_names=False)
    assert idf.index.name == 'B'

    # from a CategoricalIndex
    df = DataFrame({'A': np.random.randn(10), 'B': ci})
    idf = df.set_index('B')
    str(idf)
    tm.assert_index_equal(idf.index, ci, check_names=False)
    assert idf.index.name == 'B'

    idf = df.set_index('B').reset_index().set_index('B')
    str(idf)
    tm.assert_index_equal(idf.index, ci, check_names=False)
    assert idf.index.name == 'B'

    new_df = idf.reset_index()
    new_df.index = df.B
    tm.assert_index_equal(new_df.index, ci, check_names=False)
    assert idf.index.name == 'B'
def test_dti_set_index_reindex(self):
    # GH 6631
    df = DataFrame(np.random.random(6))
    idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern')
    idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo')

    df = df.set_index(idx1)
    tm.assert_index_equal(df.index, idx1)
    df = df.reindex(idx2)
    tm.assert_index_equal(df.index, idx2)

    # GH 11314
    # with tz
    index = date_range(datetime(2015, 10, 1),
                       datetime(2015, 10, 1, 23),
                       freq='H', tz='US/Eastern')
    df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index)
    new_index = date_range(datetime(2015, 10, 2),
                           datetime(2015, 10, 2, 23),
                           freq='H', tz='US/Eastern')

    # TODO: unused?
    result = df.set_index(new_index)  # noqa
    assert new_index.freq == index.freq
def stats(request):
    stats_by = request.GET.get('by', 'category')
    trx = Transaction.objects.filter(amount__lt=0)\
        .exclude(category__name='Credit Card Payments')
    original_df = DataFrame(data=[{k: getattr(t, k)
                                   for k in ('date', 'category', 'amount')}
                                  for t in trx])
    df = original_df.set_index('date').groupby('category')\
        .resample('M', how='sum')

    chart_df = df.reset_index()\
        .pivot_table(values='amount', index=['date'], columns=['category'],
                     aggfunc=numpy.sum)\
        .replace(numpy.NaN, 0)
    months = [x.strftime('%Y-%m-%d') for x in chart_df.index]
    chart_series = [{'name': category,
                     'type': 'column',
                     'data': [abs(float(a)) for a in amounts]}
                    for category, amounts in chart_df.iteritems()]

    table_df = df.reset_index()\
        .pivot_table(values='amount', index=['category'], columns=['date'],
                     aggfunc=numpy.sum)\
        .replace(numpy.NaN, 0)  # .reset_index()
    table_data = [(category, list(amounts))
                  for category, amounts in chart_df.iteritems()]
    total_df = original_df.set_index('date').resample('M', how='sum')\
        .transpose()
    table_data.append(('Total', total_df.values[0]))

    return render_to_response('transactions/stats.html', {
        'months_json': json.dumps(months),
        'chart_series_json': json.dumps(chart_series),
        'chart_df': chart_df,
        'months': months,
        'table_data': table_data,
    })
def calculate_top_10_solutions(self):
    '''calculate all schemes and select top 10 solutions'''
    columns = ['name', 'rate', 'money']
    if isfile(learning_progres_csv):
        scheme_profit = read_csv(learning_progres_csv)
    else:
        scheme_profit = DataFrame(columns=columns)
    scheme_profit.set_index('name', inplace=True)
    with open(learning_progres_csv, 'w+') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter=',', fieldnames=columns)
        writer.writeheader()
        csvfile.flush()
        for sc in self.generate_all_schemes():
            if sc.name not in scheme_profit.index:
                e = evaluator(sc)
                scheme_profit.ix[sc.name] = rate, money = e.calculate()
                writer.writerow({'name': sc.name, 'rate': rate,
                                 'money': money})
                csvfile.flush()
                if self.log:
                    print(sc.name + ' - ' + str(money) +
                          ' \t rate = ' + str(rate))
            else:
                writer.writerow({'name': sc.name,
                                 'rate': scheme_profit.rate[sc.name],
                                 'money': scheme_profit.money[sc.name]})
                if self.log:
                    print(sc.name + ' - ' + str(scheme_profit.money[sc.name]) +
                          ' \t rate = ' + str(scheme_profit.rate[sc.name]))
        csvfile.flush()
    # TODO: write into scheme
    # sort returns a copy, so the result must be kept for the top 10 to
    # actually be the most profitable schemes
    scheme_profit = scheme_profit.sort(['money'], ascending=False)
    return scheme_profit[:10].to_dict()
def _parse(cls, body):
    matched = re.search(r'<div class="col_r" style="">(.*?)</div>', body,
                        re.MULTILINE | re.DOTALL | re.UNICODE)
    if matched is None or len(matched.groups()) == 0:
        raise ValueError("no matched data found.")

    lines = matched.group(1).strip().split("\n")
    value_pattern = re.compile(r">(.*?)<", re.UNICODE)
    data_array = []
    stock_name = cls._get_stock_name(body)
    for line in lines:
        if r"<tr" not in line:
            continue
        data = []
        line = line.strip()
        for value in re.findall(value_pattern, line):
            value = cls._normalize(value)
            if isinstance(value, string_types) and len(value) == 0:
                continue
            data.append(value)
        if len(data) > 0:
            data_array.append(data)

    if data_array:
        data_array.insert(0, [stock_name] * len(data_array[0]))
    data_array = np.array(data_array).T
    df = DataFrame(data_array, columns=NETEASE_STOCK_INFO_COLUMNS)
    df.set_index("date", inplace=True)
    return df
def build_dataframe(days=10, fill_value=1., values={},
                    end_date=dt.date.today(), date_index=True):
    '''
    Constructs and returns a DataFrame in the form of those that are
    returned by Pandas DataReader. It doesn't take weekends or holidays
    into account, so weekend dates will generate values as well.

    Options are as follows:

    days: the number of rows to return. Defaults to 10

    fill_value: the value to fill each cell with (excluding date),
    defaults to 1

    values: A dictionary containing values with which to populate columns
    of the new dataframe. For example:
        values={'Adj Close': [5,6,7,8,9,10]}
    When one or more columns are specified, the number of rows in the new
    dataframe will be the length of the shortest column.

    end_date: The end of the range of dates comprising the dataframe.
    Takes a datetime.date. The start date is derived from a combination
    of this and the days parameter. Defaults to today's date.

    date_index: A boolean flag of whether the returned dataframe should
    set the date as the index (instead of the default numerical index).
    If True, the dataframe will perfectly mimic that which is returned
    by Pandas DataReader. Default is True.

    In addition, you may specify a non OHLC column, such as RSI, and it
    will be added to the typical OHLC dataframe that gets created.
    '''
    columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

    # determine the minimum number of rows in values
    if len(values) > 0:
        # create a helper list of key/len(value) tuples
        helper = [(key, len(value)) for key, value in values.items()]
        helper.sort(key=lambda x: x[1])
        days = helper[0][1]
        # work on a copy so the caller's dict is not mutated below
        values = dict(values)
    else:
        '''
        For some reason, values persisted across function calls when not
        declaring inside the function. I thought scoping rules would've
        deleted it after the function call, but I guess function
        parameters aren't killed?
        '''
        values = {}

    for i in columns:
        if i in values:
            values[i] = values[i][:days]
        else:
            values[i] = [fill_value] * days

    dateList = [end_date - dt.timedelta(days=i) for i in range(days)]
    # necessary so the dataframe flows from oldest to most recent when
    # read from top to bottom, like DataReader
    dateList.reverse()
    values['Date'] = DatetimeIndex(dateList)

    df = DataFrame(values, index=range(days))
    if date_index == True:
        df.set_index(keys='Date', drop=True, inplace=True)
    return df
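# Hedged example: build a 5-row frame that mimics DataReader output, with a
# custom Adj Close column (the values are arbitrary). The shortest supplied
# column caps the row count, so days=10 would still yield 5 rows here.
df = build_dataframe(days=5, values={'Adj Close': [5, 6, 7, 8, 9]})
print(df['Adj Close'])  # oldest date first, like DataReader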
def parallel_cumulative_blame(self, branch='master', limit=None, skip=None,
                              num_datapoints=None, committer=True,
                              workers=1, ignore_globs=None,
                              include_globs=None):
    """
    Returns the blame at every revision of interest. Index is a datetime,
    column per committer, with number of lines blamed to each committer at
    each timestamp as data.

    :param branch: (optional, default 'master') the branch to work in
    :param limit: (optional, default None), the maximum number of revisions
        to return, None for no limit
    :param skip: (optional, default None), the number of revisions to skip.
        Ex: skip=2 returns every other revision, None for no skipping.
    :param num_datapoints: (optional, default=None) if limit and skip are
        none, and this isn't, then num_datapoints evenly spaced revs will
        be used
    :param committer: (optional, default=True) true if committer should be
        reported, false if author
    :param ignore_globs: (optional, default=None) a list of globs to
        ignore, default none excludes nothing
    :param include_globs: (optional, default=None) a list of globs to
        include, default of None includes everything.
    :param workers: (optional, default=1) integer, the number of workers
        to use in the threadpool, -1 for one per core.
    :return: DataFrame
    """
    if not _has_joblib:
        raise ImportError('Must have joblib installed to use '
                          'parallel_cumulative_blame(), please use '
                          'cumulative_blame() instead.')

    revs = self.revs(branch=branch, limit=limit, skip=skip,
                     num_datapoints=num_datapoints)

    if self.verbose:
        print('Beginning processing for cumulative blame:')

    revisions = json.loads(revs.to_json(orient='index'))
    revisions = [revisions[key] for key in revisions]

    ds = Parallel(n_jobs=workers, backend='threading', verbose=5)(
        delayed(_parallel_cumulative_blame_func)
        (self, x, committer, ignore_globs, include_globs) for x in revisions
    )

    revs = DataFrame(ds)
    del revs['rev']

    revs['date'] = to_datetime(
        revs['date'].map(datetime.datetime.fromtimestamp))
    revs.set_index(keys=['date'], drop=True, inplace=True)
    revs = revs.fillna(0.0)

    # drop 0 cols
    for col in revs.columns.values:
        if col != 'col':
            if revs[col].sum() == 0:
                del revs[col]

    # drop 0 rows
    keep_idx = []
    committers = [x for x in revs.columns.values if x != 'date']
    for idx, row in revs.iterrows():
        if sum([row[x] for x in committers]) > 0:
            keep_idx.append(idx)

    revs = revs.ix[keep_idx]
    revs.sort_index(ascending=False, inplace=True)

    return revs
def test_set_index_cast_datetimeindex(self):
    df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                          for i in range(1000)],
                    'B': np.random.randn(1000)})

    idf = df.set_index('A')
    assert isinstance(idf.index, pd.DatetimeIndex)

    # don't cast a DatetimeIndex WITH a tz, leave as object
    # GH 6032
    i = (pd.DatetimeIndex(
        to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'], errors="raise"))
        .tz_localize('US/Pacific'))
    df = DataFrame(np.random.randn(2, 1), columns=['A'])

    expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                             tz='US/Pacific'),
                                pd.Timestamp('2013-01-02 14:00:00-0800',
                                             tz='US/Pacific')],
                               dtype="object"))

    # convert index to series
    result = Series(i)
    assert_series_equal(result, expected)

    # assign to frame
    df['B'] = i
    result = df['B']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'B'

    # keep the timezone
    result = i.to_series(keep_tz=True)
    assert_series_equal(result.reset_index(drop=True), expected)

    # convert to utc
    df['C'] = i.to_series().reset_index(drop=True)
    result = df['C']
    comp = pd.DatetimeIndex(expected.values)
    comp = comp.tz_localize(None)
    tm.assert_numpy_array_equal(result.values, comp.values)

    # list of datetimes with a tz
    df['D'] = i.to_pydatetime()
    result = df['D']
    assert_series_equal(result, expected, check_names=False)
    assert result.name == 'D'

    # GH 6785
    # set the index manually
    import pytz
    df = DataFrame(
        [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
    expected = df.set_index('ts')
    df.index = df['ts']
    df.pop('ts')
    assert_frame_equal(df, expected)
def test_set_index_timezone(self):
    # GH 12358
    # tz-aware Series should retain the tz
    i = pd.to_datetime(["2014-01-01 10:10:10"],
                       utc=True).tz_convert('Europe/Rome')
    df = DataFrame({'i': i})
    assert df.set_index(i).index[0].hour == 11
    assert pd.DatetimeIndex(pd.Series(df.i))[0].hour == 11
    assert df.set_index(df.i).index[0].hour == 11
def ledger(self, from_date=None, to_date=None, freq=None):
    """
    Show the cash ledger
    """
    df = DataFrame(self._cash)[self._columns]
    df.set_index("TS", inplace=True)
    df.sort_index(inplace=True)
    df['balance'] = df['A'].cumsum()
    return df.reset_index()  # Hack to make decorator work
def test_join_segfault(self):
    # 1532
    df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]})
    df2 = DataFrame({'a': [2, 2], 'b': [1, 2], 'y': [1, 2]})
    df1 = df1.set_index(['a', 'b'])
    df2 = df2.set_index(['a', 'b'])

    # it works!
    for how in ['left', 'right', 'outer']:
        df1.join(df2, how=how)
def test_set_index_nonuniq(self):
    df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
                    'B': ['one', 'two', 'three', 'one', 'two'],
                    'C': ['a', 'b', 'c', 'd', 'e'],
                    'D': np.random.randn(5),
                    'E': np.random.randn(5)})
    with assertRaisesRegexp(ValueError, 'Index has duplicate keys'):
        df.set_index('A', verify_integrity=True, inplace=True)
    self.assertIn('A', df)
def setIndexDataFrame():
    df = DataFrame({'a': range(7),
                    'b': range(7, 0, -1),
                    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                    'd': [0, 1, 2, 0, 1, 2, 3]})
    print(df)
    df2 = df.set_index(['c', 'd'])
    print(df2)
    df3 = df.set_index(['c', 'd'], drop=False)
    print(df3)
def test_date_index_query_with_NaT_duplicates(self):
    engine, parser = self.engine, self.parser
    n = 10
    df = DataFrame(np.random.randn(n, 3))
    df['dates1'] = date_range('1/1/2012', periods=n)
    df['dates3'] = date_range('1/1/2014', periods=n)
    df.loc[np.random.rand(n) > 0.5, 'dates1'] = pd.NaT
    df.set_index('dates1', inplace=True, drop=True)
    with pytest.raises(NotImplementedError):
        df.query('index < 20130101 < dates3', engine=engine, parser=parser)
def test_append_preserve_index_name(self):
    # #980
    df1 = DataFrame(data=None, columns=['A', 'B', 'C'])
    df1 = df1.set_index(['A'])
    df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]],
                    columns=['A', 'B', 'C'])
    df2 = df2.set_index(['A'])

    result = df1.append(df2)
    self.assertEqual(result.index.name, 'A')
def test_pandas_extend_index(self):
    d1 = DataFrame(data=[2, 4, 6, 8], columns=["A"], index=[1, 2, 3, 4])
    d1.index.name = "first"
    d1["second"] = "default"
    d1.set_index(["second"], append=True, inplace=True)
    self.assertEqual(d1.index.names, ["first", "second"])
    d1 = d1.reorder_levels(["second", "first"])
    self.assertEqual(d1.index.names, ["second", "first"])
def test_period_set_index_reindex(self):
    # GH 6631
    df = DataFrame(np.random.random(6))
    idx1 = period_range('2011/01/01', periods=6, freq='M')
    idx2 = period_range('2013', periods=6, freq='A')

    df = df.set_index(idx1)
    tm.assert_index_equal(df.index, idx1)
    df = df.set_index(idx2)
    tm.assert_index_equal(df.index, idx2)
def aggregate_chunks(mod_features_df, modality):
    without_info_df = mod_features_df.query('field != "info"')
    cnt_df = DataFrame([list(mod_features_df.loc[('info', 'count'), :].values)]
                       * len(without_info_df),
                       index=without_info_df.index)
    agg_df = without_info_df * cnt_df
    agg_df = DataFrame(agg_df.sum(axis=1) / cnt_df.sum(axis=1),
                       index=without_info_df.index)
    agg_df['modality'] = modality
    agg_df.set_index('modality', append=True, inplace=True)
    agg_df = agg_df.reorder_levels(['modality', 'field', 'feature'])
    return agg_df
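# A toy input for aggregate_chunks, assuming the layout the function implies:
# a (field, feature) MultiIndex with per-chunk counts stored under
# ('info', 'count') and one integer-labelled column per chunk. The numbers
# are made up.
import pandas as pd

idx = pd.MultiIndex.from_tuples([('info', 'count'),
                                 ('acc', 'mean'),
                                 ('gyro', 'mean')],
                                names=['field', 'feature'])
mod_df = pd.DataFrame([[100, 50], [0.1, 0.3], [0.2, 0.4]], index=idx)
agg = aggregate_chunks(mod_df, 'accelerometer')  # count-weighted mean per row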
def build_state_data(where_inner="", where_outer=""): """ Generates a bar graph of complaint counts by state """ query = COMPLAINTS_BY_STATE.format(where_inner, where_outer) cur.execute(query) cc_by_state = DataFrame(cur.fetchall(), columns=['state', 'complaint_count']) cc_by_state.set_index('state', drop=False) return cc_by_state
def test_sort_multi_index(self):
    # GH 25775, testing that sorting by index works with a multi-index.
    df = DataFrame({'a': [3, 1, 2], 'b': [0, 0, 0],
                    'c': [0, 1, 2], 'd': list('abc')})
    result = df.set_index(list('abc')).sort_index(level=list('ba'))

    expected = DataFrame({'a': [1, 2, 3], 'b': [0, 0, 0],
                          'c': [1, 2, 0], 'd': list('bca')})
    expected = expected.set_index(list('abc'))

    tm.assert_frame_equal(result, expected)
def test_date_index_query(self):
    engine, parser = self.engine, self.parser
    n = 10
    df = DataFrame(np.random.randn(n, 3))
    df['dates1'] = date_range('1/1/2012', periods=n)
    df['dates3'] = date_range('1/1/2014', periods=n)
    df.set_index('dates1', inplace=True, drop=True)
    res = df.query('(index < 20130101) & (20130101 < dates3)',
                   engine=engine, parser=parser)
    expec = df[(df.index < '20130101') & ('20130101' < df.dates3)]
    assert_frame_equal(res, expec)
def _pricing_iter(csvdir, symbols, metadata, divs_splits, show_progress):
    with maybe_show_progress(symbols, show_progress,
                             label='Loading custom pricing data: ') as it:
        files = os.listdir(csvdir)
        for sid, symbol in enumerate(it):
            logger.debug('%s: sid %s' % (symbol, sid))

            try:
                fname = [fname for fname in files
                         if '%s.csv' % symbol in fname][0]
            except IndexError:
                raise ValueError("%s.csv file is not in %s" %
                                 (symbol, csvdir))

            dfr = read_csv(os.path.join(csvdir, fname),
                           parse_dates=[0],
                           infer_datetime_format=True,
                           index_col=0).sort_index()

            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if 'split' in dfr.columns:
                tmp = 1. / dfr[dfr['split'] != 1.0]['split']
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=['effective_date'])
                split['ratio'] = tmp.tolist()
                split['sid'] = sid

                splits = divs_splits['splits']
                index = Index(range(splits.shape[0],
                                    splits.shape[0] + split.shape[0]))
                split.set_index(index, inplace=True)
                divs_splits['splits'] = splits.append(split)

            if 'dividend' in dfr.columns:
                # ex_date   amount  sid record_date declared_date pay_date
                tmp = dfr[dfr['dividend'] != 0.0]['dividend']
                div = DataFrame(data=tmp.index.tolist(),
                                columns=['ex_date'])
                div['record_date'] = NaT
                div['declared_date'] = NaT
                div['pay_date'] = NaT
                div['amount'] = tmp.tolist()
                div['sid'] = sid

                divs = divs_splits['divs']
                ind = Index(range(divs.shape[0],
                                  divs.shape[0] + div.shape[0]))
                div.set_index(ind, inplace=True)
                divs_splits['divs'] = divs.append(div)

            yield sid, dfr
def transform_dataframe(self, dataframe):
    """
    Use matplotlib to compute boxplot statistics on e.g. timeseries data.
    """
    grouping = self.get_grouping(dataframe)
    group_field = self.get_group_field()
    header_fields = self.get_header_fields()

    if "series" in grouping:
        # Unstack so each series is a column
        for i in range(len(header_fields) + 1):
            dataframe = dataframe.unstack()
        groups = {col: dataframe[col] for col in dataframe.columns}

    if "year" in grouping:
        interval = "year"
    elif "month" in grouping:
        interval = "month"
    else:
        interval = None

    # Compute stats for each column, potentially grouped by year
    all_stats = []
    for header, series in groups.items():
        if interval:
            series_stats = self.boxplots_for_interval(series, interval)
        else:
            interval = None
            series_stats = [self.compute_boxplot(series)]

        series_infos = []
        for series_stat in series_stats:
            series_info = {}
            if isinstance(header, tuple):
                value_name = header[0]
                col_values = header[1:]
            else:
                value_name = header
                col_values = []
            col_names = zip(dataframe.columns.names[1:], col_values)
            for col_name, value in col_names:
                series_info[col_name] = value

            for stat_name, val in series_stat.items():
                if stat_name == interval:
                    series_info[stat_name] = val
                else:
                    series_info[value_name + '-' + stat_name] = val
            series_infos.append(series_info)
        all_stats += series_infos

    dataframe = DataFrame(all_stats)
    if 'series' in grouping:
        index = header_fields + [group_field]
        unstack = len(header_fields)
        if interval:
            index = [interval] + index
            unstack += 1
    else:
        index = [interval]
        unstack = 0

    dataframe.set_index(index, inplace=True)
    dataframe.columns.name = ''
    for i in range(unstack):
        dataframe = dataframe.unstack()

    # Remove blank columns
    dataframe = dataframe.dropna(axis=1, how='all')
    return dataframe
def parse_ht_xml(xml, ht_fun, select=None,
                 corr_csv=r'C:\ecan\hilltop\ht_corrections.csv'):
    """
    Function to read a Hilltop xml file and apply a function on each
    individual site time series. The input to the function is a single
    pandas time series. The output should be a Series or DataFrame.
    Specific sites with specific mtypes can be passed in the form of a
    two column DataFrame with headers as 'site' and 'mtype'.
    """
    ### Base parameters
    rem_s = 10958 * 24 * 60 * 60
    corr = read_csv(corr_csv)
    xml_name = basename(xml)

    ### Select corrections
    corr1 = corr[corr.file_name == xml_name]

    ### Parse xml
    root = etree.iterparse(xml, tag='Measurement')

    ### Iterate
    results1 = []
    for event, elem in root:
        ## Get data
        site = elem.values()[0]
        mtype = elem.find('DataSource').values()[0]
        if select is not None:
            if not isinstance(select, DataFrame):
                raise ValueError(
                    'Make sure the input is a DataFrame with two columns!')
            elif all(select.columns == ['site', 'mtype']):
                site_check = any([
                    set([site, mtype]) ==
                    set([select.loc[i].site, select.loc[i].mtype])
                    for i in select.index
                ])
                if not site_check:
                    continue

        # units = elem.find('DataSource').find('ItemInfo').find('Units').text
        site_data = [j.text.split() for j in elem.find('Data').findall('V')]

        ## Convert to dataframe
        o2 = DataFrame(site_data, columns=['date', 'val'])
        o2.loc[:, ['date', 'val']] = o2.loc[:, ['date', 'val']].astype(float)
        o2.loc[:, 'date'] = to_datetime(o2.loc[:, 'date'] - rem_s, unit='s')
        o2.set_index('date', inplace=True)

        ## Make corrections
        corr_index = (corr1.orig_site == site) & (corr1.orig_mtype == mtype)
        if any(corr_index):
            site, mtype = corr1.loc[
                corr_index, ['new_site', 'new_mtype']].values.tolist()[0]

        ## Clear element from memory
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

        ## Do stats
        stats1 = ht_fun(o2, mtype, site)

        ## Add additional site specific columns/data
        # stats1.loc[:, 'site'] = site
        # stats1.loc[:, 'mtype'] = mtype
        # stats1.loc[:, 'units'] = units

        ## Append
        results1.append(stats1)

    ### Combine data
    df_out = concat(results1)

    ### Return
    return df_out
def test_pivot_timegrouper(self):
    df = DataFrame({
        'Branch': 'A A A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
        'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
        'Date': [datetime.datetime(2013, 1, 1),
                 datetime.datetime(2013, 1, 1),
                 datetime.datetime(2013, 10, 1),
                 datetime.datetime(2013, 10, 2),
                 datetime.datetime(2013, 10, 1),
                 datetime.datetime(2013, 10, 2),
                 datetime.datetime(2013, 12, 2),
                 datetime.datetime(2013, 12, 2)]
    }).set_index('Date')

    expected = DataFrame(np.array([10, 18, 3], dtype='int64').reshape(1, 3),
                         index=[datetime.datetime(2013, 12, 31)],
                         columns='Carl Joe Mark'.split())
    expected.index.name = 'Date'
    expected.columns.name = 'Buyer'

    result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer',
                         values='Quantity', aggfunc=np.sum)
    tm.assert_frame_equal(result, expected)

    result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'),
                         values='Quantity', aggfunc=np.sum)
    tm.assert_frame_equal(result, expected.T)

    expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3),
                         index=[datetime.datetime(2013, 1, 1),
                                datetime.datetime(2013, 7, 1)],
                         columns='Carl Joe Mark'.split())
    expected.index.name = 'Date'
    expected.columns.name = 'Buyer'

    result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer',
                         values='Quantity', aggfunc=np.sum)
    tm.assert_frame_equal(result, expected)

    result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'),
                         values='Quantity', aggfunc=np.sum)
    tm.assert_frame_equal(result, expected.T)

    # passing the name
    df = df.reset_index()
    result = pivot_table(df, index=Grouper(freq='6MS', key='Date'),
                         columns='Buyer', values='Quantity', aggfunc=np.sum)
    tm.assert_frame_equal(result, expected)

    result = pivot_table(df, index='Buyer',
                         columns=Grouper(freq='6MS', key='Date'),
                         values='Quantity', aggfunc=np.sum)
    tm.assert_frame_equal(result, expected.T)

    self.assertRaises(KeyError, lambda: pivot_table(
        df, index=Grouper(freq='6MS', key='foo'),
        columns='Buyer', values='Quantity', aggfunc=np.sum))
    self.assertRaises(KeyError, lambda: pivot_table(
        df, index='Buyer', columns=Grouper(freq='6MS', key='foo'),
        values='Quantity', aggfunc=np.sum))

    # passing the level
    df = df.set_index('Date')
    result = pivot_table(df, index=Grouper(freq='6MS', level='Date'),
                         columns='Buyer', values='Quantity', aggfunc=np.sum)
    tm.assert_frame_equal(result, expected)

    result = pivot_table(df, index='Buyer',
                         columns=Grouper(freq='6MS', level='Date'),
                         values='Quantity', aggfunc=np.sum)
    tm.assert_frame_equal(result, expected.T)

    self.assertRaises(ValueError, lambda: pivot_table(
        df, index=Grouper(freq='6MS', level='foo'),
        columns='Buyer', values='Quantity', aggfunc=np.sum))
    self.assertRaises(ValueError, lambda: pivot_table(
        df, index='Buyer', columns=Grouper(freq='6MS', level='foo'),
        values='Quantity', aggfunc=np.sum))

    # double grouper
    df = DataFrame({
        'Branch': 'A A A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
        'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
        'Date': [datetime.datetime(2013, 11, 1, 13, 0),
                 datetime.datetime(2013, 9, 1, 13, 5),
                 datetime.datetime(2013, 10, 1, 20, 0),
                 datetime.datetime(2013, 10, 2, 10, 0),
                 datetime.datetime(2013, 11, 1, 20, 0),
                 datetime.datetime(2013, 10, 2, 10, 0),
                 datetime.datetime(2013, 10, 2, 12, 0),
                 datetime.datetime(2013, 12, 5, 14, 0)],
        'PayDay': [datetime.datetime(2013, 10, 4, 0, 0),
                   datetime.datetime(2013, 10, 15, 13, 5),
                   datetime.datetime(2013, 9, 5, 20, 0),
                   datetime.datetime(2013, 11, 2, 10, 0),
                   datetime.datetime(2013, 10, 7, 20, 0),
                   datetime.datetime(2013, 9, 5, 10, 0),
                   datetime.datetime(2013, 12, 30, 12, 0),
                   datetime.datetime(2013, 11, 20, 14, 0)]
    })

    result = pivot_table(df, index=Grouper(freq='M', key='Date'),
                         columns=Grouper(freq='M', key='PayDay'),
                         values='Quantity', aggfunc=np.sum)
    expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan,
                                   6, np.nan, 1, 9,
                                   np.nan, 9, np.nan, np.nan,
                                   np.nan, np.nan, 3, np.nan]).reshape(4, 4),
                         index=[datetime.datetime(2013, 9, 30),
                                datetime.datetime(2013, 10, 31),
                                datetime.datetime(2013, 11, 30),
                                datetime.datetime(2013, 12, 31)],
                         columns=[datetime.datetime(2013, 9, 30),
                                  datetime.datetime(2013, 10, 31),
                                  datetime.datetime(2013, 11, 30),
                                  datetime.datetime(2013, 12, 31)])
    expected.index.name = 'Date'
    expected.columns.name = 'PayDay'
    tm.assert_frame_equal(result, expected)

    result = pivot_table(df, index=Grouper(freq='M', key='PayDay'),
                         columns=Grouper(freq='M', key='Date'),
                         values='Quantity', aggfunc=np.sum)
    tm.assert_frame_equal(result, expected.T)

    tuples = [(datetime.datetime(2013, 9, 30),
               datetime.datetime(2013, 10, 31)),
              (datetime.datetime(2013, 10, 31),
               datetime.datetime(2013, 9, 30)),
              (datetime.datetime(2013, 10, 31),
               datetime.datetime(2013, 11, 30)),
              (datetime.datetime(2013, 10, 31),
               datetime.datetime(2013, 12, 31)),
              (datetime.datetime(2013, 11, 30),
               datetime.datetime(2013, 10, 31)),
              (datetime.datetime(2013, 12, 31),
               datetime.datetime(2013, 11, 30))]
    idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay'])
    expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan,
                                   9, np.nan, 9, np.nan,
                                   np.nan, 3]).reshape(6, 2),
                         index=idx, columns=['A', 'B'])
    expected.columns.name = 'Branch'

    result = pivot_table(df, index=[Grouper(freq='M', key='Date'),
                                    Grouper(freq='M', key='PayDay')],
                         columns=['Branch'], values='Quantity',
                         aggfunc=np.sum)
    tm.assert_frame_equal(result, expected)

    result = pivot_table(df, index=['Branch'],
                         columns=[Grouper(freq='M', key='Date'),
                                  Grouper(freq='M', key='PayDay')],
                         values='Quantity', aggfunc=np.sum)
    tm.assert_frame_equal(result, expected.T)
def test_set_index_datetime(self):
    # GH#3950
    df = DataFrame({
        "label": ["a", "a", "a", "b", "b", "b"],
        "datetime": ["2011-07-19 07:00:00", "2011-07-19 08:00:00",
                     "2011-07-19 09:00:00", "2011-07-19 07:00:00",
                     "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
        "value": range(6),
    })
    df.index = to_datetime(df.pop("datetime"), utc=True)
    df.index = df.index.tz_convert("US/Pacific")

    expected = DatetimeIndex(
        ["2011-07-19 07:00:00", "2011-07-19 08:00:00",
         "2011-07-19 09:00:00"],
        name="datetime",
    )
    expected = expected.tz_localize("UTC").tz_convert("US/Pacific")

    df = df.set_index("label", append=True)
    tm.assert_index_equal(df.index.levels[0], expected)
    tm.assert_index_equal(df.index.levels[1],
                          Index(["a", "b"], name="label"))
    assert df.index.names == ["datetime", "label"]

    df = df.swaplevel(0, 1)
    tm.assert_index_equal(df.index.levels[0],
                          Index(["a", "b"], name="label"))
    tm.assert_index_equal(df.index.levels[1], expected)
    assert df.index.names == ["label", "datetime"]

    df = DataFrame(np.random.random(6))
    idx1 = DatetimeIndex(
        ["2011-07-19 07:00:00", "2011-07-19 08:00:00",
         "2011-07-19 09:00:00", "2011-07-19 07:00:00",
         "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
        tz="US/Eastern",
    )
    idx2 = DatetimeIndex(
        ["2012-04-01 09:00", "2012-04-01 09:00", "2012-04-01 09:00",
         "2012-04-02 09:00", "2012-04-02 09:00", "2012-04-02 09:00"],
        tz="US/Eastern",
    )
    idx3 = date_range("2011-01-01 09:00", periods=6, tz="Asia/Tokyo")
    idx3 = idx3._with_freq(None)

    df = df.set_index(idx1)
    df = df.set_index(idx2, append=True)
    df = df.set_index(idx3, append=True)

    expected1 = DatetimeIndex(
        ["2011-07-19 07:00:00", "2011-07-19 08:00:00",
         "2011-07-19 09:00:00"],
        tz="US/Eastern",
    )
    expected2 = DatetimeIndex(["2012-04-01 09:00", "2012-04-02 09:00"],
                              tz="US/Eastern")

    tm.assert_index_equal(df.index.levels[0], expected1)
    tm.assert_index_equal(df.index.levels[1], expected2)
    tm.assert_index_equal(df.index.levels[2], idx3)

    # GH#7092
    tm.assert_index_equal(df.index.get_level_values(0), idx1)
    tm.assert_index_equal(df.index.get_level_values(1), idx2)
    tm.assert_index_equal(df.index.get_level_values(2), idx3)
def apply2features(df: pd.DataFrame, features: List,
                   processor: Callable) -> pd.DataFrame:
    not_features = [col for col in df.columns if col not in features]
    return processor(df.set_index(not_features)).reset_index()
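# Hedged usage sketch for apply2features: the column names and the z-score
# processor below are made up for illustration. Non-feature columns are
# parked in the index so the processor only sees the feature columns.
import pandas as pd

def zscore(frame: pd.DataFrame) -> pd.DataFrame:
    # Normalize every (non-index) column to zero mean and unit variance.
    return (frame - frame.mean()) / frame.std()

raw = pd.DataFrame({'id': [1, 2, 3],
                    'f1': [1.0, 2.0, 3.0],
                    'f2': [10.0, 20.0, 30.0]})
out = apply2features(raw, features=['f1', 'f2'], processor=zscore)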
def test_to_latex_multiindex(self):
    df = DataFrame({("x", "y"): ["a"]})
    result = df.to_latex()
    expected = r"""\begin{tabular}{ll}
\toprule
{} & x \\
{} & y \\
\midrule
0 & a \\
\bottomrule
\end{tabular}
"""
    assert result == expected

    result = df.T.to_latex()
    expected = r"""\begin{tabular}{lll}
\toprule
 & & 0 \\
\midrule
x & y & a \\
\bottomrule
\end{tabular}
"""
    assert result == expected

    df = DataFrame.from_dict({
        ("c1", 0): pd.Series({x: x for x in range(4)}),
        ("c1", 1): pd.Series({x: x + 4 for x in range(4)}),
        ("c2", 0): pd.Series({x: x for x in range(4)}),
        ("c2", 1): pd.Series({x: x + 4 for x in range(4)}),
        ("c3", 0): pd.Series({x: x for x in range(4)}),
    }).T
    result = df.to_latex()
    expected = r"""\begin{tabular}{llrrrr}
\toprule
 & & 0 & 1 & 2 & 3 \\
\midrule
c1 & 0 & 0 & 1 & 2 & 3 \\
 & 1 & 4 & 5 & 6 & 7 \\
c2 & 0 & 0 & 1 & 2 & 3 \\
 & 1 & 4 & 5 & 6 & 7 \\
c3 & 0 & 0 & 1 & 2 & 3 \\
\bottomrule
\end{tabular}
"""
    assert result == expected

    # GH 14184
    df = df.T
    df.columns.names = ["a", "b"]
    result = df.to_latex()
    expected = r"""\begin{tabular}{lrrrrr}
\toprule
a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\
b & 0 & 1 & 0 & 1 & 0 \\
\midrule
0 & 0 & 4 & 0 & 4 & 0 \\
1 & 1 & 5 & 1 & 5 & 1 \\
2 & 2 & 6 & 2 & 6 & 2 \\
3 & 3 & 7 & 3 & 7 & 3 \\
\bottomrule
\end{tabular}
"""
    assert result == expected

    # GH 10660
    df = pd.DataFrame({"a": [0, 0, 1, 1],
                       "b": list("abab"),
                       "c": [1, 2, 3, 4]})
    result = df.set_index(["a", "b"]).to_latex()
    expected = r"""\begin{tabular}{llr}
\toprule
 & & c \\
a & b & \\
\midrule
0 & a & 1 \\
 & b & 2 \\
1 & a & 3 \\
 & b & 4 \\
\bottomrule
\end{tabular}
"""
    assert result == expected

    result = df.groupby("a").describe().to_latex()
    expected = r"""\begin{tabular}{lrrrrrrrr}
\toprule
{} & \multicolumn{8}{l}{c} \\
{} & count & mean & std & min & 25\% & 50\% & 75\% & max \\
a & & & & & & & & \\
\midrule
0 & 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 & 2.0 \\
1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 \\
\bottomrule
\end{tabular}
"""
    assert result == expected
def test_set_index2(self):
    df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
                    'B': ['one', 'two', 'three', 'one', 'two'],
                    'C': ['a', 'b', 'c', 'd', 'e'],
                    'D': np.random.randn(5),
                    'E': np.random.randn(5)})

    # new object, single-column
    result = df.set_index('C')
    result_nodrop = df.set_index('C', drop=False)

    index = Index(df['C'], name='C')

    expected = df.loc[:, ['A', 'B', 'D', 'E']]
    expected.index = index

    expected_nodrop = df.copy()
    expected_nodrop.index = index

    assert_frame_equal(result, expected)
    assert_frame_equal(result_nodrop, expected_nodrop)
    self.assertEqual(result.index.name, index.name)

    # inplace, single
    df2 = df.copy()
    df2.set_index('C', inplace=True)
    assert_frame_equal(df2, expected)

    df3 = df.copy()
    df3.set_index('C', drop=False, inplace=True)
    assert_frame_equal(df3, expected_nodrop)

    # create new object, multi-column
    result = df.set_index(['A', 'B'])
    result_nodrop = df.set_index(['A', 'B'], drop=False)

    index = MultiIndex.from_arrays([df['A'], df['B']], names=['A', 'B'])

    expected = df.loc[:, ['C', 'D', 'E']]
    expected.index = index

    expected_nodrop = df.copy()
    expected_nodrop.index = index

    assert_frame_equal(result, expected)
    assert_frame_equal(result_nodrop, expected_nodrop)
    self.assertEqual(result.index.names, index.names)

    # inplace
    df2 = df.copy()
    df2.set_index(['A', 'B'], inplace=True)
    assert_frame_equal(df2, expected)

    df3 = df.copy()
    df3.set_index(['A', 'B'], drop=False, inplace=True)
    assert_frame_equal(df3, expected_nodrop)

    # corner case
    with assertRaisesRegexp(ValueError, 'Index has duplicate keys'):
        df.set_index('A', verify_integrity=True)

    # append
    result = df.set_index(['A', 'B'], append=True)
    xp = df.reset_index().set_index(['index', 'A', 'B'])
    xp.index.names = [None, 'A', 'B']
    assert_frame_equal(result, xp)

    # append to existing multiindex
    rdf = df.set_index(['A'], append=True)
    rdf = rdf.set_index(['B', 'C'], append=True)
    expected = df.set_index(['A', 'B', 'C'], append=True)
    assert_frame_equal(rdf, expected)

    # Series
    result = df.set_index(df.C)
    self.assertEqual(result.index.name, 'C')
def _data_to_frame(data, header, index_col, infer_types, skiprows):
    """Parse a BeautifulSoup table into a DataFrame.

    Parameters
    ----------
    data : tuple of lists
        The raw data to be placed into a DataFrame. This is a list of
        lists of strings or unicode. If it helps, it can be thought of as
        a matrix of strings instead.

    header : int or None
        An integer indicating the row to use for the column header or
        None indicating no header will be used.

    index_col : int or None
        An integer indicating the column to use for the index or None
        indicating no column will be used.

    infer_types : bool
        Whether to convert numbers and dates.

    skiprows : collections.Container or int or slice
        Iterable used to skip rows.

    Returns
    -------
    df : DataFrame
        A DataFrame containing the data from `data`

    Raises
    ------
    ValueError
        * If `skiprows` is not found in the rows of the parsed DataFrame.

    See Also
    --------
    read_html

    Notes
    -----
    The `data` parameter is guaranteed not to be a list of empty lists.
    """
    thead, tbody, tfoot = data
    columns = thead or None
    df = DataFrame(tbody, columns=columns)

    if skiprows is not None:
        it = _get_skiprows_iter(skiprows)

        try:
            df = df.drop(it)
        except ValueError:
            raise ValueError('Labels {0} not found when trying to skip'
                             ' rows'.format(it))

    # convert to numbers/dates where possible
    # must be sequential since dates trump numbers if both args are given
    if infer_types:
        df = df.convert_objects(convert_numeric=True)
        df = df.convert_objects(convert_dates='coerce')

    if header is not None:
        header_rows = df.iloc[header]

        if header_rows.ndim == 2:
            names = header_rows.index
            df.columns = MultiIndex.from_arrays(header_rows.values,
                                                names=names)
        else:
            df.columns = header_rows

        df = df.drop(df.index[header])

    if index_col is not None:
        cols = df.columns[index_col]

        try:
            cols = cols.tolist()
        except AttributeError:
            pass

        # drop by default
        df.set_index(cols, inplace=True)

        if df.index.nlevels == 1:
            if isnull(df.index.name) or not df.index.name:
                df.index.name = None
        else:
            names = [name or None for name in df.index.names]
            df.index = MultiIndex.from_tuples(df.index.values, names=names)

    return df
def compute_fractal(begin_date, end_date):
    codes = get_all_codes()
    # codes = ['000151']

    # Compute the signal for each stock
    for index, code in enumerate(codes):
        try:
            # Fetch backward-adjusted prices and use them to compute the
            # fractal signals
            daily_cursor = DB_CONN['daily_hfq'].find(
                {'code': code,
                 'date': {'$gte': begin_date, '$lte': end_date}},
                sort=[('date', ASCENDING)],
                projection={'date': True, 'high': True, 'low': True,
                            '_id': False})

            df_daily = DataFrame([daily for daily in daily_cursor])

            # Use the date as the index
            df_daily.set_index(['date'], inplace=True)

            # Use shift to align the two preceding and two following days
            # with the middle day
            df_daily_shift_1 = df_daily.shift(1)
            df_daily_shift_2 = df_daily.shift(2)
            df_daily_shift_3 = df_daily.shift(3)
            df_daily_shift_4 = df_daily.shift(4)

            # Top fractal: the middle day's high is greater than the highs
            # of both the two preceding and the two following days
            df_daily['up'] = (df_daily_shift_3['high'] > df_daily_shift_1['high']) & \
                             (df_daily_shift_3['high'] > df_daily_shift_2['high']) & \
                             (df_daily_shift_3['high'] > df_daily_shift_4['high']) & \
                             (df_daily_shift_3['high'] > df_daily['high'])

            # Bottom fractal: the middle day's low is less than the lows
            # of both the two preceding and the two following days
            df_daily['down'] = (df_daily_shift_3['low'] < df_daily_shift_1['low']) & \
                               (df_daily_shift_3['low'] < df_daily_shift_2['low']) & \
                               (df_daily_shift_3['low'] < df_daily_shift_4['low']) & \
                               (df_daily_shift_3['low'] < df_daily['low'])

            # Keep only the dates where a top or bottom fractal signal
            # occurred; all other rows are discarded
            df_daily = df_daily[(df_daily['up'] | df_daily['down'])]

            # Drop the columns that are no longer needed
            df_daily.drop(['high', 'low'], axis=1, inplace=True)
            # print(df_daily)
            '''
                           up   down
            date
            2019-05-15  False   True
            2019-05-16   True  False
            2019-05-20   True  False
            2019-05-23  False   True
            '''

            # Save the signals to the database
            update_requests = []
            # Each saved document holds the code, the date, and the
            # direction of the signal
            for date in df_daily.index:
                doc = {
                    'code': code,
                    'date': date,
                    # up: top fractal, down: bottom fractal
                    'direction': 'up' if df_daily.loc[date]['up'] else 'down'
                }
                # Upserts filter on code, date, and direction, so an index
                # is needed on those three fields:
                # db.fractal_signal.createIndex({'code': 1, 'date': 1, 'direction': 1})
                update_requests.append(
                    UpdateOne(doc, {'$set': doc}, upsert=True))

            if len(update_requests) > 0:
                update_result = DB_CONN['fractal'].bulk_write(
                    update_requests, ordered=False)

                print('Save Fractal, #%d, stock code: %s, inserted: %4d, updated: %4d' %
                      (index + 1, code, update_result.upserted_count,
                       update_result.modified_count), flush=True)
        except:
            print('Error occurred: %s' % code, flush=True)
            traceback.print_exc()
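# A self-contained sketch of shift-based fractal detection on synthetic
# prices, without the MongoDB plumbing. Unlike the function above, this
# variant centers the 5-day window on shift(2), so the middle day is
# compared symmetrically against two days on each side; the price data is
# made up.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
prices = pd.DataFrame({'high': rng.random(30) + 10,
                       'low': rng.random(30) + 9},
                      index=pd.date_range('2019-01-01', periods=30))
mid = prices.shift(2)  # candidate middle day of each 5-day window
others = [prices.shift(k) for k in (4, 3, 1, 0)]
up = np.logical_and.reduce([mid['high'] > o['high'] for o in others])
down = np.logical_and.reduce([mid['low'] < o['low'] for o in others])
signals = prices[up | down]  # dates carrying a top or bottom fractal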
def test_equals(self):
    s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
    s2 = s1.copy()
    assert s1.equals(s2)

    s1[1] = 99
    assert not s1.equals(s2)

    # NaNs compare as equal
    s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
    s2 = s1.copy()
    assert s1.equals(s2)

    s2[0] = 9.9
    assert not s1.equals(s2)

    idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')])
    s1 = Series([1, 2, np.nan], index=idx)
    s2 = s1.copy()
    assert s1.equals(s2)

    # Add object dtype column with nans
    index = np.random.random(10)
    df1 = DataFrame(np.random.random(10), index=index, columns=['floats'])
    df1['text'] = 'the sky is so blue. we could use more chocolate.'.split()
    df1['start'] = date_range('2000-1-1', periods=10, freq='T')
    df1['end'] = date_range('2000-1-1', periods=10, freq='D')
    df1['diff'] = df1['end'] - df1['start']
    df1['bool'] = (np.arange(10) % 3 == 0)
    df1.loc[::2] = np.nan
    df2 = df1.copy()
    assert df1['text'].equals(df2['text'])
    assert df1['start'].equals(df2['start'])
    assert df1['end'].equals(df2['end'])
    assert df1['diff'].equals(df2['diff'])
    assert df1['bool'].equals(df2['bool'])
    assert df1.equals(df2)
    assert not df1.equals(object)

    # different dtype
    different = df1.copy()
    different['floats'] = different['floats'].astype('float32')
    assert not df1.equals(different)

    # different index
    different_index = -index
    different = df2.set_index(different_index)
    assert not df1.equals(different)

    # different columns
    different = df2.copy()
    different.columns = df2.columns[::-1]
    assert not df1.equals(different)

    # DatetimeIndex
    index = pd.date_range('2000-1-1', periods=10, freq='T')
    df1 = df1.set_index(index)
    df2 = df1.copy()
    assert df1.equals(df2)

    # MultiIndex
    df3 = df1.set_index(['text'], append=True)
    df2 = df1.set_index(['text'], append=True)
    assert df3.equals(df2)

    df2 = df1.set_index(['floats'], append=True)
    assert not df3.equals(df2)

    # NaN in index
    df3 = df1.set_index(['floats'], append=True)
    df2 = df1.set_index(['floats'], append=True)
    assert df3.equals(df2)

    # GH 8437
    a = pd.Series([False, np.nan])
    b = pd.Series([False, np.nan])
    c = pd.Series(index=range(2))
    d = pd.Series(index=range(2))
    e = pd.Series(index=range(2))
    f = pd.Series(index=range(2))
    c[:-1] = d[:-1] = e[0] = f[0] = False
    assert a.equals(a)
    assert a.equals(b)
    assert a.equals(c)
    assert a.equals(d)
    assert a.equals(e)
    assert e.equals(f)
def __init__(self, ps: ParamStore, camp: str, profile: pd.DataFrame,
             profile_override_dict={}):
    self.ps = ps
    self.camp = camp
    disease_params = ps.get_disease_params()
    camp_params = ps.get_camp_params(camp)

    # ------------------------------------------------------------
    # disease params
    parameter_csv = disease_params
    model_params = parameter_csv[parameter_csv['Type'] == 'Model Parameter']
    model_params = model_params.loc[:, ['Name', 'Value']]
    control_data = parameter_csv[parameter_csv['Type'] == 'Control']
    self.model_params = model_params

    profile.set_index('Parameter', inplace=True)
    self.number_of_people_in_isoboxes = int(
        profile.loc['number_of_people_in_isoboxes', 'Value'])
    self.number_of_people_in_one_isobox = int(
        profile.loc['number_of_people_in_one_isobox', 'Value'])
    self.number_of_isoboxes = self.number_of_people_in_isoboxes / \
        self.number_of_people_in_one_isobox

    self.number_of_people_in_tents = int(
        profile.loc['number_of_people_in_tents', 'Value'])
    self.number_of_people_in_one_tent = int(
        profile.loc['number_of_people_in_one_tent', 'Value'])
    self.number_of_tents = self.number_of_people_in_tents / \
        self.number_of_people_in_one_tent

    self.total_population = self.number_of_people_in_isoboxes + \
        self.number_of_people_in_tents

    # float(profile.loc['permanently_asymptomatic_cases','Value'])
    self.permanently_asymptomatic_cases = 0.179
    self.age_and_gender = abm.read_age_gender(self.total_population)

    # float(profile.loc['area_covered_by_isoboxes','Value'])
    self.area_covered_by_isoboxes = 0.5
    # float(profile.loc['relative_strength_of_interaction','Value'])
    self.relative_strength_of_interaction = 0.2

    # float(profile.loc['smaller_movement_radius','Value'])
    self.smaller_movement_radius = 0.02
    # float(profile.loc['larger_movement_radius','Value'])
    self.larger_movement_radius = 0.1
    # float(profile.loc['overlapping_rages_radius','Value'])
    self.overlapping_rages_radius = 0.02

    self.number_of_steps = int(profile.loc['number_of_steps', 'Value'])
    self.number_of_states = 14
    self.track_states = np.zeros(
        (self.number_of_steps, self.number_of_states))
    self.ACTIVATE_INTERVENTION = profile.loc['ACTIVATE_INTERVENTION',
                                             'Value']
    # int(profile.loc['total_number_of_hospitalized','Value'])
    self.total_number_of_hospitalized = 0

    self.num_toilet_visit = int(profile.loc['num_toilet_visit', 'Value'])
    self.num_toilet_contact = int(profile.loc['num_toilet_contact',
                                              'Value'])
    self.num_food_visit = int(profile.loc['num_food_visit', 'Value'])
    self.num_food_contact = int(profile.loc['num_food_contact', 'Value'])
    self.pct_food_visit = float(profile.loc['pct_food_visit', 'Value'])

    # float(profile.loc['transmission_reduction','Value'])
    self.transmission_reduction = 1

    # float(profile.loc['probability_infecting_person_in_household_per_day','Value'])
    self.probability_infecting_person_in_household_per_day = 0.33
    # float(profile.loc['probability_infecting_person_in_foodline_per_day','Value'])
    self.probability_infecting_person_in_foodline_per_day = 0.407
    # float(profile.loc['probability_infecting_person_in_toilet_per_day','Value'])
    self.probability_infecting_person_in_toilet_per_day = 0.099
    # float(profile.loc['probability_infecting_person_in_moving_per_day','Value'])
    self.probability_infecting_person_in_moving_per_day = 0.017
    # float(profile.loc['probability_spotting_symptoms_per_day','Value'])
    self.probability_spotting_symptoms_per_day = 0.05
    self.clearday = int(profile.loc['clearday', 'Value'])
    tb = profile.loc['toilets_blocks', 'Value'].split(',')
    self.toilets_blocks = (int(tb[0]), int(tb[1]))
    fb = profile.loc['foodline_blocks', 'Value'].split(',')
    self.foodline_blocks = (int(fb[0]), int(fb[1]))

    self.population = abm.form_population_matrix(
        self.total_population,
        self.number_of_isoboxes,
        self.number_of_people_in_isoboxes,
        self.number_of_tents,
        self.number_of_people_in_tents,
        self.permanently_asymptomatic_cases,
        self.age_and_gender)

    self.households_location = abm.place_households(
        self.population[:, 0].astype(int),
        self.area_covered_by_isoboxes,
        self.number_of_isoboxes)

    self.toilets_location, self.toilets_numbers, self.toilets_sharing = \
        abm.position_toilet(self.households_location,
                            self.toilets_blocks[0], self.toilets_blocks[1])
    self.foodpoints_location, self.foodpoints_numbers, self.foodpoints_sharing = \
        abm.position_foodline(self.households_location,
                              self.foodline_blocks[0],
                              self.foodline_blocks[1])
    self.ethnical_corellations = abm.create_ethnic_groups(
        self.households_location, self.relative_strength_of_interaction)
    self.local_interaction_space = abm.interaction_neighbours(
        self.households_location,
        self.smaller_movement_radius,
        self.larger_movement_radius,
        self.overlapping_rages_radius,
        self.ethnical_corellations)

    self.control_dict = {}
def parse_table_schema(json, precise_float):
    """
    Builds a DataFrame from a given schema

    Parameters
    ----------
    json :
        A JSON table schema
    precise_float : boolean
        Flag controlling precision when decoding string to double values,
        as dictated by ``read_json``

    Returns
    -------
    df : DataFrame

    Raises
    ------
    NotImplementedError
        If the JSON table schema contains either timezone or timedelta
        data

    Notes
    -----
    Because :func:`DataFrame.to_json` uses the string 'index' to denote a
    name-less :class:`Index`, this function sets the name of the returned
    :class:`DataFrame` to ``None`` when said string is encountered with a
    normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
    applies to any strings beginning with 'level_'. Therefore, an
    :class:`Index` name of 'index' and :class:`MultiIndex` names starting
    with 'level_' are not supported.

    See Also
    --------
    build_table_schema : Inverse function.
    pandas.read_json
    """
    table = loads(json, precise_float=precise_float)
    col_order = [field["name"] for field in table["schema"]["fields"]]
    df = DataFrame(table["data"], columns=col_order)[col_order]

    dtypes = {
        field["name"]: convert_json_field_to_pandas_type(field)
        for field in table["schema"]["fields"]
    }

    # No ISO constructor for Timedelta as of yet, so need to raise
    if "timedelta64" in dtypes.values():
        raise NotImplementedError(
            'table="orient" can not yet read ISO-formatted Timedelta data')

    df = df.astype(dtypes)

    if "primaryKey" in table["schema"]:
        df = df.set_index(table["schema"]["primaryKey"])
        if len(df.index.names) == 1:
            if df.index.name == "index":
                df.index.name = None
        else:
            df.index.names = [
                None if x.startswith("level_") else x
                for x in df.index.names
            ]

    return df
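# Round-trip sketch: DataFrame.to_json(orient='table') emits exactly the
# kind of payload this function consumes (the frame itself is arbitrary).
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
payload = df.to_json(orient='table')
restored = parse_table_schema(payload, precise_float=False)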
def test_set_index_nan(self):
    # GH 3586
    df = DataFrame({'PRuid': {17: 'nonQC', 18: 'nonQC', 19: 'nonQC',
                              20: '10', 21: '11', 22: '12', 23: '13',
                              24: '24', 25: '35', 26: '46', 27: '47',
                              28: '48', 29: '59', 30: '10'},
                    'QC': {17: 0.0, 18: 0.0, 19: 0.0, 20: np.nan,
                           21: np.nan, 22: np.nan, 23: np.nan, 24: 1.0,
                           25: np.nan, 26: np.nan, 27: np.nan,
                           28: np.nan, 29: np.nan, 30: np.nan},
                    'data': {17: 7.9544899999999998,
                             18: 8.0142609999999994,
                             19: 7.8591520000000008,
                             20: 0.86140349999999999,
                             21: 0.87853110000000001,
                             22: 0.8427041999999999,
                             23: 0.78587700000000005,
                             24: 0.73062459999999996,
                             25: 0.81668560000000001,
                             26: 0.81927080000000008,
                             27: 0.80705009999999999,
                             28: 0.81440240000000008,
                             29: 0.80140849999999997,
                             30: 0.81307740000000006},
                    'year': {17: 2006, 18: 2007, 19: 2008, 20: 1985,
                             21: 1985, 22: 1985, 23: 1985, 24: 1985,
                             25: 1985, 26: 1985, 27: 1985, 28: 1985,
                             29: 1985, 30: 1986}}).reset_index()

    result = df.set_index(['year', 'PRuid', 'QC']).reset_index().reindex(
        columns=df.columns)
    tm.assert_frame_equal(result, df)
def test_set_index_cast_datetimeindex(self):
    df = DataFrame({'A': [datetime(2000, 1, 1) + timedelta(i)
                          for i in range(1000)],
                    'B': np.random.randn(1000)})

    idf = df.set_index('A')
    tm.assertIsInstance(idf.index, pd.DatetimeIndex)

    # don't cast a DatetimeIndex WITH a tz, leave as object
    # GH 6032
    i = (pd.DatetimeIndex(
        pd.tseries.tools.to_datetime(['2013-1-1 13:00', '2013-1-2 14:00'],
                                     errors="raise"))
        .tz_localize('US/Pacific'))
    df = DataFrame(np.random.randn(2, 1), columns=['A'])

    expected = Series(np.array([pd.Timestamp('2013-01-01 13:00:00-0800',
                                             tz='US/Pacific'),
                                pd.Timestamp('2013-01-02 14:00:00-0800',
                                             tz='US/Pacific')],
                               dtype="object"))

    # convert index to series
    result = Series(i)
    assert_series_equal(result, expected)

    # assign to frame
    df['B'] = i
    result = df['B']
    assert_series_equal(result, expected, check_names=False)
    self.assertEqual(result.name, 'B')

    # keep the timezone
    result = i.to_series(keep_tz=True)
    assert_series_equal(result.reset_index(drop=True), expected)

    # convert to utc
    df['C'] = i.to_series().reset_index(drop=True)
    result = df['C']
    comp = pd.DatetimeIndex(expected.values).copy()
    comp.tz = None
    self.assert_numpy_array_equal(result.values, comp.values)

    # list of datetimes with a tz
    df['D'] = i.to_pydatetime()
    result = df['D']
    assert_series_equal(result, expected, check_names=False)
    self.assertEqual(result.name, 'D')

    # GH 6785
    # set the index manually
    import pytz
    df = DataFrame(
        [{'ts': datetime(2014, 4, 1, tzinfo=pytz.utc), 'foo': 1}])
    expected = df.set_index('ts')
    df.index = df['ts']
    df.pop('ts')
    assert_frame_equal(df, expected)

    # GH 3950
    # reset_index with single level
    for tz in ['UTC', 'Asia/Tokyo', 'US/Eastern']:
        idx = pd.date_range('1/1/2011', periods=5, freq='D', tz=tz,
                            name='idx')
        df = pd.DataFrame({'a': range(5),
                           'b': ['A', 'B', 'C', 'D', 'E']}, index=idx)

        expected = pd.DataFrame({'idx': [datetime(2011, 1, 1),
                                         datetime(2011, 1, 2),
                                         datetime(2011, 1, 3),
                                         datetime(2011, 1, 4),
                                         datetime(2011, 1, 5)],
                                 'a': range(5),
                                 'b': ['A', 'B', 'C', 'D', 'E']},
                                columns=['idx', 'a', 'b'])
        expected['idx'] = expected['idx'].apply(
            lambda d: pd.Timestamp(d, tz=tz))
        assert_frame_equal(df.reset_index(), expected)
def test_to_csv_quoting(self):
    df = DataFrame({
        'c_bool': [True, False],
        'c_float': [1.0, 3.2],
        'c_int': [42, np.nan],
        'c_string': ['a', 'b,c'],
    })

    expected_rows = [',c_bool,c_float,c_int,c_string',
                     '0,True,1.0,42.0,a',
                     '1,False,3.2,,"b,c"']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)

    result = df.to_csv()
    assert result == expected

    result = df.to_csv(quoting=None)
    assert result == expected

    expected_rows = [',c_bool,c_float,c_int,c_string',
                     '0,True,1.0,42.0,a',
                     '1,False,3.2,,"b,c"']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)

    result = df.to_csv(quoting=csv.QUOTE_MINIMAL)
    assert result == expected

    expected_rows = ['"","c_bool","c_float","c_int","c_string"',
                     '"0","True","1.0","42.0","a"',
                     '"1","False","3.2","","b,c"']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)

    result = df.to_csv(quoting=csv.QUOTE_ALL)
    assert result == expected

    # see gh-12922, gh-13259: make sure changes to
    # the formatters do not break this behaviour
    expected_rows = ['"","c_bool","c_float","c_int","c_string"',
                     '0,True,1.0,42.0,"a"',
                     '1,False,3.2,"","b,c"']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)

    result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
    assert result == expected

    msg = "need to escape, but no escapechar set"
    with pytest.raises(csv.Error, match=msg):
        df.to_csv(quoting=csv.QUOTE_NONE)

    with pytest.raises(csv.Error, match=msg):
        df.to_csv(quoting=csv.QUOTE_NONE, escapechar=None)

    expected_rows = [',c_bool,c_float,c_int,c_string',
                     '0,True,1.0,42.0,a',
                     '1,False,3.2,,b!,c']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)

    result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar='!')
    assert result == expected

    expected_rows = [',c_bool,c_ffloat,c_int,c_string',
                     '0,True,1.0,42.0,a',
                     '1,False,3.2,,bf,c']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)

    result = df.to_csv(quoting=csv.QUOTE_NONE, escapechar='f')
    assert result == expected

    # see gh-3503: quoting Windows line terminators
    # presents with encoding?
    text_rows = ['a,b,c', '1,"test \r\n",3']
    text = tm.convert_rows_list_to_csv_str(text_rows)
    df = pd.read_csv(StringIO(text))

    buf = StringIO()
    df.to_csv(buf, encoding='utf-8', index=False)
    assert buf.getvalue() == text

    # xref gh-7791: make sure the quoting parameter is passed through
    # with multi-indexes
    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
    df = df.set_index(['a', 'b'])

    expected_rows = ['"a","b","c"', '"1","3","5"', '"2","4","6"']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    assert df.to_csv(quoting=csv.QUOTE_ALL) == expected
# Hierarchical indexing walkthrough. The Series construction was truncated in
# the source; np.random.randn(10) is assumed for the data values.
import numpy as np
from pandas import DataFrame, Series

data = Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])
data.index
data['b']
data['b':'c']
data.loc[['b', 'd']]  # .ix is deprecated; .loc is the modern spelling
data.unstack()
data.unstack().stack()

frame = DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                           ['Green', 'Red', 'Green']])
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame['Ohio']

# Reordering and sorting levels
frame.swaplevel('key1', 'key2')

# Summary by level (sum(level=...) is deprecated; group on the index level)
frame.groupby(level='key2').sum()
frame.T.groupby(level='color').sum().T

# Using a DataFrame's columns
frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),
                   'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]})
frame2 = frame.set_index(['c', 'd'])
frame.set_index(['c', 'd'], drop=False)
frame2.reset_index()
def test_groupby_resample_interpolate(): # GH 35325 d = {"price": [10, 11, 9], "volume": [50, 60, 50]} df = DataFrame(d) df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") result = ( df.set_index("week_starting") .groupby("volume") .resample("1D") .interpolate(method="linear") ) msg = "containing strings is deprecated" with tm.assert_produces_warning(FutureWarning, match=msg): expected_ind = pd.MultiIndex.from_tuples( [ (50, "2018-01-07"), (50, Timestamp("2018-01-08")), (50, Timestamp("2018-01-09")), (50, Timestamp("2018-01-10")), (50, Timestamp("2018-01-11")), (50, Timestamp("2018-01-12")), (50, Timestamp("2018-01-13")), (50, Timestamp("2018-01-14")), (50, Timestamp("2018-01-15")), (50, Timestamp("2018-01-16")), (50, Timestamp("2018-01-17")), (50, Timestamp("2018-01-18")), (50, Timestamp("2018-01-19")), (50, Timestamp("2018-01-20")), (50, Timestamp("2018-01-21")), (60, Timestamp("2018-01-14")), ], names=["volume", "week_starting"], ) expected = DataFrame( data={ "price": [ 10.0, 9.928571428571429, 9.857142857142858, 9.785714285714286, 9.714285714285714, 9.642857142857142, 9.571428571428571, 9.5, 9.428571428571429, 9.357142857142858, 9.285714285714286, 9.214285714285714, 9.142857142857142, 9.071428571428571, 9.0, 11.0, ], "volume": [50.0] * 15 + [60], }, index=expected_ind, ) tm.assert_frame_equal(result, expected)
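# The same pattern in miniature (toy numbers, not the fixture above): upsample
# each group to daily frequency, then fill the gap linearly.
import pandas as pd

toy = pd.DataFrame({'grp': [1, 1], 'val': [0.0, 3.0]},
                   index=pd.to_datetime(['2018-01-01', '2018-01-04']))
out = toy.groupby('grp').resample('1D').interpolate(method='linear')
# val on 2018-01-02 / 2018-01-03 comes out as 1.0 / 2.0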
def test_reset_index_level(self): df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "C", "D"]) for levels in ["A", "B"], [0, 1]: # With MultiIndex result = df.set_index(["A", "B"]).reset_index(level=levels[0]) tm.assert_frame_equal(result, df.set_index("B")) result = df.set_index(["A", "B"]).reset_index(level=levels[:1]) tm.assert_frame_equal(result, df.set_index("B")) result = df.set_index(["A", "B"]).reset_index(level=levels) tm.assert_frame_equal(result, df) result = df.set_index(["A", "B"]).reset_index(level=levels, drop=True) tm.assert_frame_equal(result, df[["C", "D"]]) # With single-level Index (GH 16263) result = df.set_index("A").reset_index(level=levels[0]) tm.assert_frame_equal(result, df) result = df.set_index("A").reset_index(level=levels[:1]) tm.assert_frame_equal(result, df) result = df.set_index(["A"]).reset_index(level=levels[0], drop=True) tm.assert_frame_equal(result, df[["B", "C", "D"]]) # Missing levels - for both MultiIndex and single-level Index: for idx_lev in ["A", "B"], ["A"]: with pytest.raises(KeyError, match=r"(L|l)evel \(?E\)?"): df.set_index(idx_lev).reset_index(level=["A", "E"]) with pytest.raises(IndexError, match="Too many levels"): df.set_index(idx_lev).reset_index(level=[0, 1, 2])
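# Compact sketch of partial reset_index: only the requested level leaves the
# index, the rest stays. Frame and names are illustrative.
import pandas as pd

toy = pd.DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7]})
partial = toy.set_index(['A', 'B']).reset_index(level='A')
assert list(partial.columns) == ['A', 'C'] and partial.index.name == 'B'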
#print(rows)
conn.close()

# In[46]:

import pandas as pd
from pandas import Series, DataFrame

db1 = DataFrame(db)
# Create a 'datetime' column so it can later serve as the index:
# .apply(lambda x: ...) transforms each value x, here parsing the
# %Y%m%d%H%M%S-formatted value into a datetime object via pd.to_datetime.
db1['datetime'] = db1['date_index'].apply(
    lambda x: pd.to_datetime(str(x), format='%Y%m%d%H%M%S'))
db1['message_num'] = 1  # helper column for summing up message counts
db1.set_index(db1['datetime'], inplace=True)  # make datetime the index
db1 = db1.drop('datetime', axis=1)  # drop the now-redundant datetime column
db1 = db1.drop('date_index', axis=1)
db1 = db1.drop('date', axis=1)
db1

# <a id='the_destination2'></a>
# ## 1.2 Fixing the DB index

# In[47]:

import pymysql.cursors
import numpy as np

conn = pymysql.connect(host='169.56.124.93', user='******',
                       password='******', charset='utf8')
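# The parsing step above, reduced to a self-contained sketch (hypothetical
# values): a %Y%m%d%H%M%S integer column becomes the DatetimeIndex, after
# which message counts can be resampled.
import pandas as pd

log = pd.DataFrame({'date_index': [20180101120000, 20180101120130]})
log['datetime'] = pd.to_datetime(log['date_index'].astype(str),
                                 format='%Y%m%d%H%M%S')
log['message_num'] = 1
log = log.set_index('datetime').drop(columns='date_index')
per_minute = log['message_num'].resample('1min').sum()  # one message per minute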
def test_multiindex_assignment(self): # GH3777 part 2 # mixed dtype df = DataFrame( np.random.randint(5, 10, size=9).reshape(3, 3), columns=list("abc"), index=[[4, 4, 8], [8, 10, 12]], ) df["d"] = np.nan arr = np.array([0.0, 1.0]) df.loc[4, "d"] = arr tm.assert_series_equal(df.loc[4, "d"], Series(arr, index=[8, 10], name="d")) # single dtype df = DataFrame( np.random.randint(5, 10, size=9).reshape(3, 3), columns=list("abc"), index=[[4, 4, 8], [8, 10, 12]], ) df.loc[4, "c"] = arr exp = Series(arr, index=[8, 10], name="c", dtype="float64") tm.assert_series_equal(df.loc[4, "c"], exp) # scalar ok df.loc[4, "c"] = 10 exp = Series(10, index=[8, 10], name="c", dtype="float64") tm.assert_series_equal(df.loc[4, "c"], exp) # invalid assignments with pytest.raises(ValueError): df.loc[4, "c"] = [0, 1, 2, 3] with pytest.raises(ValueError): df.loc[4, "c"] = [0] # groupby example NUM_ROWS = 100 NUM_COLS = 10 col_names = [ "A" + num for num in map(str, np.arange(NUM_COLS).tolist()) ] index_cols = col_names[:5] df = DataFrame( np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), dtype=np.int64, columns=col_names, ) df = df.set_index(index_cols).sort_index() grp = df.groupby(level=index_cols[:4]) df["new_col"] = np.nan f_index = np.arange(5) def f(name, df2): return Series(np.arange(df2.shape[0]), name=df2.index.values[0]).reindex(f_index) # TODO(wesm): unused? # new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T # we are actually operating on a copy here # but in this case, that's ok for name, df2 in grp: new_vals = np.arange(df2.shape[0]) df.loc[name, "new_col"] = new_vals
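# Distilled version of the partial-index assignment above: df.loc[key, col]
# addresses the whole sub-block for that first level, so the assigned array
# must match the block length. Toy index values.
import numpy as np
import pandas as pd

toy = pd.DataFrame(np.zeros((3, 1)), columns=['c'],
                   index=pd.MultiIndex.from_arrays([[4, 4, 8], [8, 10, 12]]))
toy.loc[4, 'c'] = np.array([0.5, 1.5])  # length 2 == rows under first level 4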
        corr_prob_ret_sim = np.append(corr_prob_ret_sim, ret_versus_prob,
                                      axis=0)
        sim += 1
    elapsed_time = time.time() - start_time
    print('Simulation time:', elapsed_time)
    return (performance_modelo, ret_medio_ibov_sim, ret_medio_port_sim,
            ret_medio_port_long_sim, ret_medio_port_short_sim,
            corr_prob_ret_sim, datas_teste_sim)


# load the IBOVESPA constituents and the historical data
compomentes = ler_base_componetes()
base_total = carrega_dados()

datas = DataFrame(base_total['data'].drop_duplicates().values,
                  columns=['data'])
datas = datas.set_index(['data'])
datas = datas.sort_index(axis=0)
limit_inf = '19990202 18:00:000'
limit_sup = '20171230 18:00:000'
datas = datas.loc[limit_inf:limit_sup]
datas = datas.sort_index(axis=0)
datas = datas.reset_index(['data'])

# list of model variables (features) and of columns that receive a log
# transform (cols)
#features = ['data', 'codigo', 'retorno', 'acao_close', 'roe', 'pl', 'irf', 'sharpe', 'petroleo_close', 'dolar_close', 'dji_close', 'sp500_close', 'risco_brasil', 'ibov_fut_close']
#cols = ['acao_close', 'petroleo_close', 'dolar_close', 'dji_close', 'sp500_close', 'risco_brasil', 'ibov_fut_close']
features = ['data', 'codigo', 'retorno', 'acao_close', 'roe', 'sharpe',
            'dolar_close', 'sp500_close', 'ibov_fut_close']
cols = ['acao_close', 'dolar_close', 'sp500_close', 'ibov_fut_close']
def test_set_index_nan(self): # GH 3586 df = DataFrame( { "PRuid": { 17: "nonQC", 18: "nonQC", 19: "nonQC", 20: "10", 21: "11", 22: "12", 23: "13", 24: "24", 25: "35", 26: "46", 27: "47", 28: "48", 29: "59", 30: "10", }, "QC": { 17: 0.0, 18: 0.0, 19: 0.0, 20: np.nan, 21: np.nan, 22: np.nan, 23: np.nan, 24: 1.0, 25: np.nan, 26: np.nan, 27: np.nan, 28: np.nan, 29: np.nan, 30: np.nan, }, "data": { 17: 7.9544899999999998, 18: 8.0142609999999994, 19: 7.8591520000000008, 20: 0.86140349999999999, 21: 0.87853110000000001, 22: 0.8427041999999999, 23: 0.78587700000000005, 24: 0.73062459999999996, 25: 0.81668560000000001, 26: 0.81927080000000008, 27: 0.80705009999999999, 28: 0.81440240000000008, 29: 0.80140849999999997, 30: 0.81307740000000006, }, "year": { 17: 2006, 18: 2007, 19: 2008, 20: 1985, 21: 1985, 22: 1985, 23: 1985, 24: 1985, 25: 1985, 26: 1985, 27: 1985, 28: 1985, 29: 1985, 30: 1986, }, } ).reset_index() result = ( df.set_index(["year", "PRuid", "QC"]) .reset_index() .reindex(columns=df.columns) ) tm.assert_frame_equal(result, df)
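# The behaviour under test, reduced: NaN in a key column survives the
# set_index/reset_index round trip. Toy frame.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'k': ['a', np.nan], 'v': [1, 2]})
rt = toy.set_index('k').reset_index()
assert rt['k'].isna().tolist() == [False, True]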
def QA_fetch_index_min(code, start, end, format='numpy', frequence='1min',
                       collections=DATABASE.index_min):
    'Fetch stock minute-bar data'
    if frequence in ['1min', '1m']:
        frequence = '1min'
    elif frequence in ['5min', '5m']:
        frequence = '5min'
    elif frequence in ['15min', '15m']:
        frequence = '15min'
    elif frequence in ['30min', '30m']:
        frequence = '30min'
    elif frequence in ['60min', '60m']:
        frequence = '60min'
    __data = []
    code = QA_util_code_tolist(code)
    cursor = collections.find(
        {
            'code': {'$in': code},
            "time_stamp": {
                "$gte": QA_util_time_stamp(start),
                "$lte": QA_util_time_stamp(end)
            },
            'type': frequence
        },
        batch_size=10000)
    if format in ['dict', 'json']:
        return [data for data in cursor]
    for item in cursor:
        __data.append([
            str(item['code']), float(item['open']), float(item['high']),
            float(item['low']), float(item['close']), int(item['up_count']),
            int(item['down_count']), float(item['vol']),
            float(item['amount']), item['datetime'], item['time_stamp'],
            item['date']
        ])

    __data = DataFrame(__data, columns=[
        'code', 'open', 'high', 'low', 'close', 'up_count', 'down_count',
        'volume', 'amount', 'datetime', 'time_stamp', 'date'
    ])

    __data['datetime'] = pd.to_datetime(__data['datetime'])
    __data = __data.set_index('datetime', drop=False)
    if format in ['numpy', 'np', 'n']:
        return numpy.asarray(__data)
    elif format in ['list', 'l', 'L']:
        return numpy.asarray(__data).tolist()
    elif format in ['P', 'p', 'pandas', 'pd']:
        return __data
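# Sketch of the set_index(drop=False) idiom used above, on made-up bars: the
# datetime remains available both as the index and as an ordinary column.
import pandas as pd

bars = pd.DataFrame({'datetime': pd.to_datetime(['2020-01-01 09:30',
                                                 '2020-01-01 09:31']),
                     'close': [10.0, 10.1]})
bars = bars.set_index('datetime', drop=False)
assert bars.index.name == 'datetime' and 'datetime' in bars.columns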
def test_inplace_return_self(self): # GH 1893 data = DataFrame( {"a": ["foo", "bar", "baz", "qux"], "b": [0, 0, 1, 1], "c": [1, 2, 3, 4]} ) def _check_f(base, f): result = f(base) assert result is None # -----DataFrame----- # set_index f = lambda x: x.set_index("a", inplace=True) _check_f(data.copy(), f) # reset_index f = lambda x: x.reset_index(inplace=True) _check_f(data.set_index("a"), f) # drop_duplicates f = lambda x: x.drop_duplicates(inplace=True) _check_f(data.copy(), f) # sort f = lambda x: x.sort_values("b", inplace=True) _check_f(data.copy(), f) # sort_index f = lambda x: x.sort_index(inplace=True) _check_f(data.copy(), f) # fillna f = lambda x: x.fillna(0, inplace=True) _check_f(data.copy(), f) # replace f = lambda x: x.replace(1, 0, inplace=True) _check_f(data.copy(), f) # rename f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(data.copy(), f) # -----Series----- d = data.copy()["c"] # reset_index f = lambda x: x.reset_index(inplace=True, drop=True) _check_f(data.set_index("a")["c"], f) # fillna f = lambda x: x.fillna(0, inplace=True) _check_f(d.copy(), f) # replace f = lambda x: x.replace(1, 0, inplace=True) _check_f(d.copy(), f) # rename f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(d.copy(), f)
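# The contract checked above, in two lines: with inplace=True the method
# mutates the frame and returns None, so such calls must not be chained.
import pandas as pd

toy = pd.DataFrame({'a': ['x', 'y'], 'b': [1, 2]})
assert toy.set_index('a', inplace=True) is None  # toy is now indexed by 'a'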
def get_a_weights_prop(dim_names, df_total: pd.DataFrame): return df_total.set_index(dim_names)["total"]
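# Hypothetical usage of the helper above: the dimension columns become the
# index, and the 'total' column comes back as a Series keyed by them.
import pandas as pd

totals = pd.DataFrame({'region': ['n', 's'], 'age': [1, 2],
                       'total': [10, 20]})
weights = get_a_weights_prop(['region', 'age'], totals)
assert weights.loc[('n', 1)] == 10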
def test_int64_overflow_issues(self): # #2690, combinatorial explosion df1 = DataFrame(np.random.randn(1000, 7), columns=list('ABCDEF') + ['G1']) df2 = DataFrame(np.random.randn(1000, 7), columns=list('ABCDEF') + ['G2']) # it works! result = merge(df1, df2, how='outer') assert len(result) == 2000 low, high, n = -1 << 10, 1 << 10, 1 << 20 left = DataFrame(np.random.randint(low, high, (n, 7)), columns=list('ABCDEFG')) left['left'] = left.sum(axis=1) # one-2-one match i = np.random.permutation(len(left)) right = left.iloc[i].copy() right.columns = right.columns[:-1].tolist() + ['right'] right.index = np.arange(len(right)) right['right'] *= -1 out = merge(left, right, how='outer') assert len(out) == len(left) assert_series_equal(out['left'], -out['right'], check_names=False) result = out.iloc[:, :-2].sum(axis=1) assert_series_equal(out['left'], result, check_names=False) assert result.name is None out.sort_values(out.columns.tolist(), inplace=True) out.index = np.arange(len(out)) for how in ['left', 'right', 'outer', 'inner']: assert_frame_equal(out, merge(left, right, how=how, sort=True)) # check that left merge w/ sort=False maintains left frame order out = merge(left, right, how='left', sort=False) assert_frame_equal(left, out[left.columns.tolist()]) out = merge(right, left, how='left', sort=False) assert_frame_equal(right, out[right.columns.tolist()]) # one-2-many/none match n = 1 << 11 left = DataFrame(np.random.randint(low, high, (n, 7)).astype('int64'), columns=list('ABCDEFG')) # confirm that this is checking what it is supposed to check shape = left.apply(Series.nunique).values assert is_int64_overflow_possible(shape) # add duplicates to left frame left = concat([left, left], ignore_index=True) right = DataFrame(np.random.randint(low, high, (n // 2, 7)).astype('int64'), columns=list('ABCDEFG')) # add duplicates & overlap with left to the right frame i = np.random.choice(len(left), n) right = concat([right, right, left.iloc[i]], ignore_index=True) left['left'] = np.random.randn(len(left)) right['right'] = np.random.randn(len(right)) # shuffle left & right frames i = np.random.permutation(len(left)) left = left.iloc[i].copy() left.index = np.arange(len(left)) i = np.random.permutation(len(right)) right = right.iloc[i].copy() right.index = np.arange(len(right)) # manually compute outer merge ldict, rdict = defaultdict(list), defaultdict(list) for idx, row in left.set_index(list('ABCDEFG')).iterrows(): ldict[idx].append(row['left']) for idx, row in right.set_index(list('ABCDEFG')).iterrows(): rdict[idx].append(row['right']) vals = [] for k, lval in ldict.items(): rval = rdict.get(k, [np.nan]) for lv, rv in product(lval, rval): vals.append(k + tuple([lv, rv])) for k, rval in rdict.items(): if k not in ldict: for rv in rval: vals.append(k + tuple([np.nan, rv])) def align(df): df = df.sort_values(df.columns.tolist()) df.index = np.arange(len(df)) return df def verify_order(df): kcols = list('ABCDEFG') assert_frame_equal(df[kcols].copy(), df[kcols].sort_values(kcols, kind='mergesort')) out = DataFrame(vals, columns=list('ABCDEFG') + ['left', 'right']) out = align(out) jmask = { 'left': out['left'].notna(), 'right': out['right'].notna(), 'inner': out['left'].notna() & out['right'].notna(), 'outer': np.ones(len(out), dtype='bool') } for how in 'left', 'right', 'outer', 'inner': mask = jmask[how] frame = align(out[mask].copy()) assert mask.all() ^ mask.any() or how == 'outer' for sort in [False, True]: res = merge(left, right, how=how, sort=sort) if sort: verify_order(res) # as in GH9092 dtypes 
                    # break with outer/right join
                    assert_frame_equal(frame, align(res),
                                       check_dtype=how not in ('right',
                                                               'outer'))
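# The manual outer-merge bookkeeping from the test, shrunk to a toy example:
# per key, emit the cross product of left and right values, NaN-filling keys
# present on one side only. Keys and values here are invented.
from collections import defaultdict
from itertools import product
import numpy as np

ldict, rdict = defaultdict(list), defaultdict(list)
ldict[('k1',)] = [1.0]
rdict[('k1',)] = [2.0, 3.0]
rdict[('k2',)] = [4.0]

vals = []
for k, lval in ldict.items():
    for lv, rv in product(lval, rdict.get(k, [np.nan])):
        vals.append(k + (lv, rv))
for k, rval in rdict.items():
    if k not in ldict:
        vals.extend(k + (np.nan, rv) for rv in rval)
# vals == [('k1', 1.0, 2.0), ('k1', 1.0, 3.0), ('k2', nan, 4.0)]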
def test_interp_nan_idx(self): df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]}) df = df.set_index('A') with pytest.raises(NotImplementedError): df.interpolate(method='values')
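# For contrast with the NotImplementedError above: method='values'
# interpolates against the index values and works once the index is numeric
# and NaN-free. Toy series.
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0], index=[0.0, 10.0, 20.0])
assert s.interpolate(method='values')[10.0] == 2.0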
def test_multiindex_slicers_edges(self): # GH 8132 # various edge cases df = DataFrame({ "A": ["A0"] * 5 + ["A1"] * 5 + ["A2"] * 5, "B": ["B0", "B0", "B1", "B1", "B2"] * 3, "DATE": [ "2013-06-11", "2013-07-02", "2013-07-09", "2013-07-30", "2013-08-06", "2013-06-11", "2013-07-02", "2013-07-09", "2013-07-30", "2013-08-06", "2013-09-03", "2013-10-01", "2013-07-09", "2013-08-06", "2013-09-03", ], "VALUES": [22, 35, 14, 9, 4, 40, 18, 4, 2, 5, 1, 2, 3, 4, 2], }) df["DATE"] = pd.to_datetime(df["DATE"]) df1 = df.set_index(["A", "B", "DATE"]) df1 = df1.sort_index() # A1 - Get all values under "A0" and "A1" result = df1.loc[(slice("A1")), :] expected = df1.iloc[0:10] tm.assert_frame_equal(result, expected) # A2 - Get all values from the start to "A2" result = df1.loc[(slice("A2")), :] expected = df1 tm.assert_frame_equal(result, expected) # A3 - Get all values under "B1" or "B2" result = df1.loc[(slice(None), slice("B1", "B2")), :] expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]] tm.assert_frame_equal(result, expected) # A4 - Get all values between 2013-07-02 and 2013-07-09 result = df1.loc[(slice(None), slice(None), slice("20130702", "20130709")), :] expected = df1.iloc[[1, 2, 6, 7, 12]] tm.assert_frame_equal(result, expected) # B1 - Get all values in B0 that are also under A0, A1 and A2 result = df1.loc[(slice("A2"), slice("B0")), :] expected = df1.iloc[[0, 1, 5, 6, 10, 11]] tm.assert_frame_equal(result, expected) # B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for # the As) result = df1.loc[(slice(None), slice("B2")), :] expected = df1 tm.assert_frame_equal(result, expected) # B3 - Get all values from B1 to B2 and up to 2013-08-06 result = df1.loc[(slice(None), slice("B1", "B2"), slice("2013-08-06")), :] expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]] tm.assert_frame_equal(result, expected) # B4 - Same as A4 but the start of the date slice is not a key. # shows indexing on a partial selection slice result = df1.loc[(slice(None), slice(None), slice("20130701", "20130709")), :] expected = df1.iloc[[1, 2, 6, 7, 12]] tm.assert_frame_equal(result, expected)
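# The same slices read more naturally through pd.IndexSlice; a sketch against
# the df1 built above (equivalent of case B3).
idx = pd.IndexSlice
df1.loc[idx[:, 'B1':'B2', :'2013-08-06'], :]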
from public.models import Metabolite, Reaction
from public.models import MetaboliteMapCoordinate, ReactionMapCoordinate

#------------------------
# Metabolite positions
#------------------------
m_df = DataFrame(columns=('id', 'model_type', 'wid', 'name', 'compartment',
                          'x', 'y'))
m_cor = MetaboliteMapCoordinate.objects.all()
for k, p in enumerate(m_cor):
    # get the associated metabolite (should be exactly 1)
    m = p.metabolites.all()[0]
    # add metabolite and position
    m_df.loc[k] = (m.id, m.model_type, m.wid, m.name, p.compartment, p.x, p.y)

m_df = m_df.set_index(m_df.id)
# set the data types
m_df[['id', 'x', 'y']] = m_df[['id', 'x', 'y']].astype(int)
m_df.head(10)

#------------------------
# Reaction positions
#------------------------
r_df = DataFrame(columns=('id', 'model_type', 'wid', 'name', 'path',
                          'value_x', 'value_y', 'label_x', 'label_y'))
r_cor = ReactionMapCoordinate.objects.all()
for k, p in enumerate(r_cor):
    # get reaction
    r = p.reactions.all()[0]
    # the source breaks off mid-tuple here; the remaining fields are inferred
    # from the column list declared above
    r_df.loc[k] = (r.id, r.model_type, r.wid, r.name, p.path,
                   p.value_x, p.value_y, p.label_x, p.label_y)
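# Growing a frame row-by-row with .loc, as above, is quadratic; an
# alternative sketch (hypothetical records) collects tuples first and builds
# the frame in one shot.
import pandas as pd

records = [(1, 'M', 'w1', 'atp', 'c', 10, 20)]
m_df = pd.DataFrame.from_records(
    records,
    columns=('id', 'model_type', 'wid', 'name', 'compartment', 'x', 'y'),
)
m_df = m_df.set_index(m_df.id)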
print(g.transform(normalize))
print(g.apply(normalize))
print(g.transform('mean'))
normalized = (df['value'] - g.transform('mean')) / g.transform('std')
print(normalized)

# 12.2.2 Grouped time resampling
N = 15
times = pd.date_range('2017-05-20 00:00', freq='1min', periods=N)
df = DataFrame({'time': times, 'value': np.arange(N)})
print(df)
print(df.set_index('time').resample('5min').count())

df2 = DataFrame({
    'time': times.repeat(3),
    'key': np.tile(['a', 'b', 'c'], N),
    'value': np.arange(N * 3.)
})
print(df2)
# pd.TimeGrouper has been removed from modern pandas; pd.Grouper replaces it
time_key = pd.Grouper(freq='5min')
resampled = (df2.set_index('time').groupby(['key', time_key]).sum())
print(resampled)
print(resampled.reset_index())

# 12.3 Method chaining techniques
# 12.3.1 The pipe method