def generate_row_hash(d: DataFrame, hash_only=False, date=None) -> DataFrame:
    """

    Parameters
    ----------
    d
    hash_only
    date

    Returns
    -------

    """
    hash_cols = [
        "date",
        "area_type",
        "area_code",
        "metric_id",
        "release_id"
    ]

    try:
        d.date = d.date.map(lambda x: x.strftime("%Y-%m-%d"))
    except AttributeError:
        pass

    d.date = d.date.map(lambda x: x[:10])

    # Create hash
    hash_key = (
        d
        .loc[:, hash_cols]
        .astype(str)
        .sum(axis=1)
        .apply(str.encode)
        .apply(lambda x: blake2s(x, key=RECORD_KEY, digest_size=12).hexdigest())
    )

    if hash_only:
        return hash_key

    column_names = d.columns

    data = d.assign(
        hash=hash_key,
        seriesDate=date,
        id=hash_key
    ).loc[:, ['id', 'hash', 'seriesDate', *list(column_names)]]

    return data

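# A minimal, self-contained sketch of the keyed-hash idea used in
# `generate_row_hash` above. `RECORD_KEY` is assumed to be a bytes object of
# at most 32 bytes (a blake2s requirement); the sample columns are illustrative.
from hashlib import blake2s

import pandas as pd

RECORD_KEY = b"example-key"  # assumption: the real key comes from configuration

frame = pd.DataFrame({
    "date": ["2020-11-01"],
    "area_type": ["nation"],
    "area_code": ["E92000001"],
    "metric_id": [12],
    "release_id": [7],
})

# Concatenate the identifying columns as strings and hash each row.
row_hash = (
    frame
    .astype(str)
    .sum(axis=1)
    .apply(str.encode)
    .apply(lambda b: blake2s(b, key=RECORD_KEY, digest_size=12).hexdigest())
)
print(row_hash)
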
def compute_distribution(data):
    pre_outstanding = 0
    open_price = data.at[0, 'open']
    data = data[['date', 'volume', 'aprice', 'outstanding']]
    data.date = data.date.str.encode("UTF-8")
    np_data = data.values
    tmp_array = np.zeros((2, 6), dtype=DTYPE_LIST)
    data_array = np.zeros((2, 6), dtype=DTYPE_LIST)
    for index, row in enumerate(np_data):
        cdate, volume, aprice, outstanding = row[[0, 1, 2, 3]]
        if 0 == index:
            t1 = (index, cdate, cdate, aprice, volume, outstanding)
            t2 = (index, cdate, cdate, open_price, outstanding - volume, outstanding)
            t = np.array([t1, t2], dtype=DTYPE_LIST)
            tmp_array = t.copy()
        else:
            tmp_array = adjust_volume(tmp_array, index, volume, aprice,
                                      pre_outstanding, outstanding)
            tmp_array['date'] = cdate
            tmp_array['outstanding'] = outstanding
            tdata = (index, cdate, cdate, aprice, volume, outstanding)
            t = np.array([tdata], dtype=DTYPE_LIST)
            tmp_array = np.concatenate((tmp_array, np.array(t)), axis=0)
        pre_outstanding = outstanding
        tmp_array = tmp_array[tmp_array['volume'] > 0]
        data_array = tmp_array.copy() if 0 == index else np.concatenate(
            (data_array, tmp_array), axis=0)
    df = DataFrame(data=data_array, columns=CHIP_COLUMNS)
    df.date = df.date.str.decode('utf-8')
    df.sdate = df.sdate.str.decode('utf-8')
    df.price = df.price.astype(float).round(2)
    return df

def prep_activity_data(df: pd.DataFrame) -> pd.DataFrame:
    log('Prepping activity data frame')
    clean_column_names(df)
    df.date = pd.to_datetime(df.date)
    # Columns that should actually be numbers, but that could have commas in
    # them, so pandas treats them as objects (strings)
    number_cols = [
        'calories_burned', 'steps', 'minutes_sedentary',
        'minutes_lightly_active', 'minutes_fairly_active',
        'minutes_very_active', 'activity_calories'
    ]
    # We'll `select_dtypes` here so that if a column is already numeric, we
    # won't try to re-process it.
    for col in df[number_cols].select_dtypes('object'):
        df[col] = handle_commas(df[col])
    # We'll do a little bit of feature engineering here and combine the separate
    # active minutes columns into a single column representing overall active
    # minutes.
    df['minutes_active'] = (df.minutes_lightly_active + df.minutes_fairly_active +
                            df.minutes_very_active)
    df.drop([
        'minutes_lightly_active', 'minutes_fairly_active', 'minutes_very_active'
    ], axis=1, inplace=True)
    df['month'] = df.date.dt.strftime('%m-%b')
    df['weekday'] = df.date.dt.day_name().str[:3]
    return df.sort_values(by='date')

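# `handle_commas` is referenced above but not shown here; a plausible minimal
# implementation (an assumption, not the author's code) strips thousands
# separators before casting to a numeric dtype.
import pandas as pd


def handle_commas(series: pd.Series) -> pd.Series:
    """Convert strings such as '1,234' to numbers; non-parsable values become NaN."""
    return pd.to_numeric(series.astype(str).str.replace(",", "", regex=False),
                         errors="coerce")
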
def finance_report(start: Timestamp, end: Timestamp, market: str, symbol: str,
                   report_type: str, quarter="all") -> DataFrame:
    """
    :param start: start time
    :param end: end time
    :param market: {'HK', 'CN'}
    :param symbol: stock symbol
    :param report_type: {'indicator', 'balance', 'income', 'business'}
    :param quarter: {'all', 'Q1', 'Q2', 'Q3', 'Q4'}
    :return: data frame containing items of the financial report
    """
    count = (end.to_period(freq='Q') - start.to_period(freq='Q')).n
    end_timestamp = int(end.timestamp() * 1000)
    urlpath = f"{market}/{report_type}.json?symbol={symbol}&&type={quarter}" \
              f"&is_detail=true&count={count}&timestamp={end_timestamp}"
    url = urljoin(api_ref.finance_base, urlpath)
    data = utls.fetch(url)
    data_list = data.pop('list')
    for d in data_list:
        for k in d:
            if isinstance(d[k], list):
                d[k] = d[k][0]
    df = DataFrame(data_list).drop(columns=['ctime']).rename(
        columns={'report_date': 'date'}).set_index('date')
    df.date = df.date.astype('M8[ms]')
    df.report_name = df.report_name.str.replace('年报', 'Q4').str.replace('三季报', 'Q3')\
        .str.replace('中报', 'Q2').str.replace('一季报', 'Q1')
    return df

def process_df(self, df: pd.DataFrame, basin_indexes: List[Tuple], loop_idx: int) -> pd.DataFrame:
    """Take in a dataframe and process it before converting to array."""
    df.rename(columns={"discharge_spec": "QObs(mm/d)"}, inplace=True)
    df.date = pd.to_datetime(df.date, dayfirst=True, format="%Y-%m-%d")

    # Iterate through each category of static basin attributes and add the ones in the config yaml as a column.
    for key in DATASET_KEYS[1:]:
        # Check if any of the features requested are actually in this category, doing this gives large speedup.
        if len(self.features[key]) > 0:
            filename = f'CAMELS_GB_{key}_attributes.csv'
            attr_df: pd.DataFrame = pd.read_csv(os.path.join(self.data_dir, filename),
                                                usecols=['gauge_id'] + list(self.features[key]),
                                                index_col='gauge_id')
            for name, row in attr_df.loc[basin][self.features[key]].iteritems():
                if name == 'dom_land_cover':
                    # Label encoding is needed for only this attribute (in the landcover data).
                    dom_land_cover_dict = {"Grass and Pasture": 0, "Shrubs": 1, "Crops": 2,
                                           "Urban": 3, "Deciduous Woodland": 4,
                                           "Evergreen Woodland": 5}
                    row = dom_land_cover_dict[row]
                df[name] = row

    # Crop the date range as much as possible.
    if len(self.dates) == 0 and self.train:
        self.dates = [df.date[0], self.train_test_split]
    elif len(self.dates) == 0 and not self.train:
        self.dates = [self.train_test_split, df.date.iloc[-1]]
    df = self._crop_dates(df, start_date=self.dates[0], end_date=self.dates[1])

    # Remove as many contiguous regions of NaNs as possible.
    df = self._remove_nan_regions(df)

    # basin_indexes is a list of tuples containing the start and end indexes for each basin,
    # in the form (start_idx, end_idx).
    if loop_idx == 0:
        basin_indexes.append((0, len(df)))
    else:
        basin_indexes.append((basin_indexes[-1][1], basin_indexes[-1][1] + len(df)))
    return df

def from_dataframe(cls, df: pd.DataFrame) -> CommitDataFrame:
    if len(df) == 0:
        return cls.DF_NULL

    df.date = pd.to_datetime(df.date)
    df.set_index("date", inplace=True)
    df = df.sort_index()

    return cls.up(df)

def test_deferred_with_groupby():

    # GH 12486
    # support deferred resample ops with groupby
    data = [['2010-01-01', 'A', 2],
            ['2010-01-02', 'A', 3],
            ['2010-01-05', 'A', 8],
            ['2010-01-10', 'A', 7],
            ['2010-01-13', 'A', 3],
            ['2010-01-01', 'B', 5],
            ['2010-01-03', 'B', 2],
            ['2010-01-04', 'B', 1],
            ['2010-01-11', 'B', 7],
            ['2010-01-14', 'B', 3]]

    df = DataFrame(data, columns=['date', 'id', 'score'])
    df.date = pd.to_datetime(df.date)

    def f(x):
        return x.set_index('date').resample('D').asfreq()
    expected = df.groupby('id').apply(f)

    result = df.set_index('date').groupby('id').resample('D').asfreq()
    assert_frame_equal(result, expected)

    df = DataFrame({'date': pd.date_range(start='2016-01-01', periods=4, freq='W'),
                    'group': [1, 1, 2, 2],
                    'val': [5, 6, 7, 8]}).set_index('date')

    def f(x):
        return x.resample('1D').ffill()
    expected = df.groupby('group').apply(f)

    result = df.groupby('group').resample('1D').ffill()
    assert_frame_equal(result, expected)

def long_per_task(self):
    # a = self.b.get_chart_data(start_dt=datetime.now() - timedelta(days=4))
    a = self.b.get_chart_data(self.symbol)
    df = DataFrame(a)
    df.date = df.date.apply(datetime.fromtimestamp)
    self.get_degree(df, self.period_length, self.degree_level)
    self.get_targets(df, self.period_length, self.trace_level, self.target_rm_length)
    nop()

def normalise_records(d: DataFrame, zero_filled: Iterable[str] = tuple(),
                      cumulative: Iterable[str] = tuple(),
                      reset_index: bool = False) -> DataFrame:
    """

    Parameters
    ----------
    d
    zero_filled
    cumulative
    reset_index

    Returns
    -------

    """
    zero_filled = set(zero_filled).intersection(d.columns)
    cumulative = set(cumulative).intersection(d.columns)

    if not reset_index:
        d.sort_values(["areaType", "areaCode", "date"], inplace=True)
    else:
        d = (
            d
            .reset_index()
            .sort_values(["areaType", "areaCode", "date"])
        )

    for col in zero_filled:
        for areaCode in unique(d.areaCode):
            dm = d.loc[d.areaCode == areaCode, [col, 'date']]
            indices = (
                (d.areaCode == areaCode) &
                (d.date < dm.dropna(axis=0).date.max()) &
                (d.date >= dm.dropna(axis=0).date.min())
            )
            d.loc[indices, col] = d.loc[indices, col].fillna(0)

    # Area names are scattered around - we cannot use
    # normal `fillna` to fill them.
    if "areaName" in d.columns:
        for areaCode in unique(d.areaCode):
            area_name = unique(d.loc[d.areaCode == areaCode, "areaName"].dropna().values)[0]
            d.loc[d.areaCode == areaCode, 'areaName'] = area_name

    for col in cumulative:
        for areaCode in unique(d.areaCode):
            dm = d.loc[d.areaCode == areaCode, [col, 'date']]
            indices = (
                (d.areaCode == areaCode) &
                (d.date < dm.dropna(axis=0).date.max()) &
                (d.date >= dm.dropna(axis=0).date.min())
            )
            d.loc[indices, col] = d.loc[indices, col].fillna(method="ffill")

    d.date = d.date.map(lambda x: x.strftime("%Y-%m-%d"))

    if "areaName" in d.columns:
        d = d.assign(areaNameLower=d.areaName.str.lower())

    return d

def normalise_demographics_records(
        d: DataFrame,
        zero_filled: Iterable[str] = tuple(),
        cumulative: Iterable[str] = tuple()
) -> DataFrame:
    """

    Parameters
    ----------
    d
    zero_filled
    cumulative

    Returns
    -------

    """
    zero_filled = set(zero_filled).intersection(d.columns)
    cumulative = set(cumulative).intersection(d.columns)

    d = d.reset_index().sort_values(["areaType", "areaCode", "date", "age"])

    d.loc[:, zero_filled] = (
        d
        .loc[:, zero_filled]
        .where(d.loc[:, zero_filled].notnull(), 0)
    )

    # Area names are scattered around - we cannot use
    # normal `fillna` to fill them.
    if "areaName" in d.columns:
        for areaCode in d.areaCode.dropna().unique():
            area_name = unique(d.loc[d.areaCode == areaCode, "areaName"].dropna().values)[0]
            d.loc[d.areaCode == areaCode, 'areaName'] = area_name

    # All cumulative metrics should have the same starting
    # point across different age bands.
    d.loc[d.date == d.date.min(), cumulative] = (
        d
        .loc[d.date == d.date.min(), cumulative]
        .where(d.loc[d.date == d.date.min(), cumulative].notnull(), 0)
    )

    for col, areaCode, age in product(cumulative, d.areaCode.unique(), d.age.unique()):
        dm = d.loc[((d.areaCode == areaCode) & (d.age == age)), [col, 'date']]
        indices = (
            (d.areaCode == areaCode) &
            (d.age == age) &
            (d.date < dm.dropna(axis=0).date.max()) &
            (d.date >= dm.dropna(axis=0).date.min())
        )
        d.loc[indices, col] = d.loc[indices, col].fillna(method="ffill")

    d.date = d.date.map(lambda x: x.strftime("%Y-%m-%d"))

    if "areaName" in d.columns:
        d = d.assign(areaNameLower=d.areaName.str.lower())

    return d

def add_columns(df: pd.DataFrame):
    # Format date.
    df.date = pd.to_datetime(df.date, format='%Y%m%d')

    # Set the date as the DataFrame's index.
    df = df.set_index('date')

    # Add date-derived columns.
    df['date'] = df.index.date
    df['year'] = df.index.year
    df['month'] = df.index.month
    df['week'] = df.index.week
    df['dow'] = df.index.day_name()
    df['dowIndex'] = df.index.dayofweek

    # Add group-summarization columns.
    df_weekly = df.groupby('week', as_index=False)['posIncrease'].agg(
        {
            'weeklyPosIncrease': 'sum',
            'meanWeeklyPosIncrease': 'mean',
            'stdWeeklyPosIncrease': 'std',
        },
    )
    df = pd.merge(
        df,
        df_weekly,
        how='left',
        on='week',
    )
    df['pctWeeklyPosIncrease'] = percent(df.posIncrease, df.weeklyPosIncrease)
    df['zscoreWeeklyPosIncrease'] = zScore(
        df.posIncrease,
        df.meanWeeklyPosIncrease,
        df.stdWeeklyPosIncrease,
    )

    # Add delta columns.
    df['day1LagDelta'] = lag_delta(df.posIncrease, 1)
    df['day1LeadDelta'] = lead_delta(df.posIncrease, 1)

    # Add local extrema columns.
    df['localMaximum'] = df.apply(local_max, axis=1)
    df['localMinimum'] = df.apply(local_min, axis=1)

    # Save a copy of the processed data.
    df.to_csv(
        f'{DATA_DIR}/02_intermediate/{DAILY}_add_columns.csv',
        index=True,
    )

    # Debug data frame.
    DEBUG and preview(df, add_columns.__name__)

    # Return data frame for reuse.
    return df

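# `lag_delta` and `lead_delta` are not defined in this snippet; the sketches
# below are hedged guesses based on the usual shift-and-subtract pattern, not
# the original helpers.
import pandas as pd


def lag_delta(series: pd.Series, periods: int) -> pd.Series:
    """Change relative to the value `periods` rows earlier."""
    return series - series.shift(periods)


def lead_delta(series: pd.Series, periods: int) -> pd.Series:
    """Change relative to the value `periods` rows later."""
    return series.shift(-periods) - series
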
def homogenise_demographics_dates(d: DataFrame):
    """

    Parameters
    ----------
    d

    Returns
    -------

    """
    d.date = to_datetime(d.date, format="%Y-%m-%d")
    col_names = d.columns

    date = date_range(
        start=to_datetime(d.date).min(),
        end=to_datetime(d.date).max()
    )

    dt_time_list = list()
    age = d.age.unique()

    for area_type in unique(d.areaType):
        values = product(
            [area_type],
            unique(d.loc[d.areaType == area_type, "areaCode"]),
            date,
            age
        )

        d_date = DataFrame(
            columns=["value"],
            index=MultiIndex.from_tuples(
                tuples=list(values),
                names=["areaType", "areaCode", "date", "age"]
            )
        )

        dt_time_list.append(d_date)

    dt_time = concat(dt_time_list)
    dt_time.reset_index(inplace=True)

    d = d.merge(dt_time, how='outer', on=['areaType', 'areaCode', 'date', 'age'])

    d.sort_values(
        ["date", "areaType", "areaCode", "age"],
        ascending=[True, True, False, True],
        inplace=True
    )

    return d.loc[:, col_names]

def base_floating_profit(df, mdate=None):
    s_index = 0
    np_data = df.to_records(index=False)
    np_data = np_data.astype(DTYPE_LIST)
    index_array = np.arange(len(np_data))
    ppchange_array = np.zeros(len(np_data), dtype=float)
    if mdate is None:
        get_breakup_data(np_data)
        break_index_lists = np.where(np_data['breakup'] != 0)[0]
        effective_breakup_index_list = get_effective_breakup_index(
            break_index_lists, np_data)
        np_data['pday'] = 1
        np_data['base'] = np_data['close'].copy()
        if len(effective_breakup_index_list) == 0:
            np_data['profit'] = (np_data['close'] - np_data['uprice']) / np_data['uprice']
        else:
            for e_index in effective_breakup_index_list:
                if s_index == e_index:
                    if len(effective_breakup_index_list) == 1:
                        base = np_data['uprice'][s_index]
                        direction = np_data['breakup'][s_index]
                        ppchange = 1.1 if direction > 0 else 0.9
                        np_data['base'][s_index] = base
                        ppchange_array[s_index:] = ppchange
                        np_data['pday'][s_index:] = direction * (
                            index_array[s_index:] - s_index + 1)
                else:
                    base = np_data['uprice'][s_index]
                    direction = np_data['breakup'][e_index]
                    ppchange = 1.1 if direction < 0 else 0.9
                    np_data['base'][s_index:e_index] = base
                    ppchange_array[s_index:e_index] = ppchange
                    np_data['pday'][s_index:e_index] = -1 * direction * (
                        index_array[s_index:e_index] - s_index + 1)
                    s_index = e_index
                if e_index == effective_breakup_index_list[-1]:
                    base = np_data['uprice'][e_index]
                    direction = np_data['breakup'][e_index]
                    ppchange = 1.1 if direction > 0 else 0.9
                    np_data['base'][e_index:] = base
                    ppchange_array[e_index:] = ppchange
                    np_data['pday'][e_index:] = direction * (
                        index_array[e_index:] - e_index + 1)
            np_data['profit'] = abs(
                np.log(np_data['close']) - np.log(np_data['base'])) / np.log(ppchange_array)
    df = DataFrame(data=np_data, columns=DATA_COLUMS)
    df.date = df.date.str.decode('utf-8')
    return df

def _sanitize(df: pd.DataFrame) -> pd.DataFrame:
    """
    Method used to sanitize the dataframe. It should normalize the dataframe data.
    In this example I just normalize the date format (it is true that I could do
    this in the read method), but we can imagine an index validation/transformation
    or invalid-character handling. All these manipulations depend on the need.

    :param df: Data frame to sanitize
    :return: Sanitized Dataframe
    """
    if "date" in df.columns:
        df.date = pd.to_datetime(df.date, infer_datetime_format=True,
                                 cache=True).dt.strftime("%Y-%m-%d")
    return df

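# Example use of `_sanitize` on a toy frame with mixed date formats; the sample
# values are illustrative only and assume the function above is in scope.
import pandas as pd

raw = pd.DataFrame({"date": ["2021-03-01", "03/02/2021"], "value": [1, 2]})
clean = _sanitize(raw)
print(clean.date.tolist())  # all dates rendered as 'YYYY-MM-DD' strings
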
def _parse_boro(data: DataFrame, column_prefix: str, fips: str) -> DataFrame:
    data = table_rename(
        data,
        {
            "DATE_OF_INTEREST": "date",
            f"{column_prefix}_CASE_COUNT": "new_confirmed",
            f"{column_prefix}_HOSPITALIZED_COUNT": "new_hospitalized",
            f"{column_prefix}_DEATH_COUNT": "new_deceased",
        },
        drop=True,
    )
    data.date = data.date.apply(lambda x: datetime_isoformat(x, "%m/%d/%Y"))
    data["key"] = f"US_NY_{fips}"
    return data

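# `datetime_isoformat` comes from the surrounding project; a hedged stand-in
# with the same call signature would parse the given format and emit ISO dates.
from datetime import datetime


def datetime_isoformat(value: str, date_format: str) -> str:
    """Parse `value` with `date_format` and return it as 'YYYY-MM-DD'."""
    return datetime.strptime(value, date_format).date().isoformat()
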
def process_db_results(self, results: DataFrame) -> bytes:
    try:
        results.date = results.date.map(lambda x: f"{x:%Y-%m-%d}")
    except ValueError:
        pass

    res = (
        results
        .rename(columns={
            "areaType": "area_type",
            "areaName": "area_name",
            "areaCode": "area_code"
        })
        .to_json(orient="records")
        .encode()
    )

    return res

def apply_charting_to_df(
    df: pd.DataFrame, chart_period: str, start_time: str, stop_time: str
):
    """Modifies the dataframe based on the chart_period, start dates and end dates

    Parameters
    ----------
        df: dataframe with data loaded
        chart_period: string, describes how often to sample data, default is '1Min' (1 minute)
            see https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
        start_time: datestring in YYYY-MM-DD HH:MM (ex. 2020-08-31 04:00) of when to begin the backtest
        stop_time: datestring of YYYY-MM-DD HH:MM when to stop the backtest

    Returns
    -------
        DataFrame, a sorted dataframe ready for consumption by run_backtest
    """
    if df.index.dtype != "datetime64[ns]":
        headers = df.columns.values.tolist()
        headers.extend([df.index.name])
        if "date" not in headers:
            raise Exception(
                "Data does not have a date column. Headers must include date, open, high, low, close, volume."
            )
        time_unit = detect_time_unit(df.date[1])
        df.date = pd.to_datetime(df.date, unit=time_unit)
        df.set_index("date", inplace=True)

    if start_time:
        if isinstance(start_time, datetime) or type(start_time) is int:
            time_unit = detect_time_unit(start_time)
            start_time = pd.to_datetime(start_time, unit=time_unit)
            start_time = start_time.strftime("%Y-%m-%d %H:%M:%S")

    if stop_time:
        if isinstance(stop_time, datetime) or type(stop_time) is int:
            time_unit = detect_time_unit(stop_time)
            stop_time = pd.to_datetime(stop_time, unit=time_unit)
            stop_time = stop_time.strftime("%Y-%m-%d %H:%M:%S")

    df = df.resample(chart_period).first()

    if start_time and stop_time:
        df = df[start_time:stop_time]  # noqa
    elif start_time and not stop_time:
        df = df[start_time:]  # noqa
    elif not start_time and stop_time:
        df = df[:stop_time]

    return df

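# `detect_time_unit` is assumed to distinguish second- from millisecond-based
# epoch values; this is a hedged guess at its behaviour, not the project's code.
from datetime import datetime


def detect_time_unit(value) -> str:
    """Return a pandas `unit` string ('s' or 'ms') for an epoch-like value."""
    if isinstance(value, datetime):
        value = value.timestamp()
    # 13-digit epochs are milliseconds, 10-digit epochs are seconds.
    return "ms" if len(str(int(float(value)))) >= 13 else "s"
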
def compute_oneday_distribution(pre_date_dist, cdate, pos, volume, aprice,
                                pre_outstanding, outstanding):
    np_pre_data = pre_date_dist.to_records(index=False)
    np_pre_data = np_pre_data.astype(DTYPE_LIST)
    np_pre_data = adjust_volume(np_pre_data, pos, volume, aprice,
                                pre_outstanding, outstanding)
    np_pre_data['date'] = cdate
    np_pre_data['outstanding'] = outstanding
    np_pre_data = np.concatenate(
        (np_pre_data,
         np.array([(pos, cdate, cdate, aprice, volume, outstanding)], dtype=DTYPE_LIST)),
        axis=0)
    df = DataFrame(data=np_pre_data, columns=CHIP_COLUMNS)
    df = df[df.volume != 0]
    df.date = df.date.str.decode('utf-8')
    df.sdate = df.sdate.str.decode('utf-8')
    df.price = df.price.astype(float).round(2)
    return df.reset_index(drop=True)

def magic_predict(data: pd.DataFrame, steps: List[PreprocessingStep],
                  params: List[Dict[str, Any]], model: Model,
                  evaluation: bool = False) -> pd.DataFrame:
    alphas = [1.028, 1.023, 1.018]
    weights = [1 / len(alphas)] * len(alphas)
    sub = 0.

    fday = datetime(2016, 4, 25)
    useless_cols = ["id", "date", "demand", "d", "wm_yr_wk"]
    max_lags = 57
    cols = [f"F{i}" for i in range(1, 29)]
    data.date = pd.to_datetime(data.date)
    pred_range = range(28, 56) if evaluation else range(0, 28)

    for icount, (alpha, weight) in enumerate(zip(alphas, weights)):
        for tdelta in pred_range:
            reset_dataframe_pivot_cache(data)
            _process_and_predict(data, model, steps, params, fday, tdelta,
                                 max_lags, useless_cols, alpha=alpha)

        te_sub = _to_submission_format(data)

        if icount == 0:
            sub = te_sub
            sub[cols] *= weight
        else:
            sub[cols] += te_sub[cols] * weight
        print(icount, alpha, weight)

    return sub

def test_deferred_with_groupby():

    # GH 12486
    # support deferred resample ops with groupby
    data = [
        ["2010-01-01", "A", 2],
        ["2010-01-02", "A", 3],
        ["2010-01-05", "A", 8],
        ["2010-01-10", "A", 7],
        ["2010-01-13", "A", 3],
        ["2010-01-01", "B", 5],
        ["2010-01-03", "B", 2],
        ["2010-01-04", "B", 1],
        ["2010-01-11", "B", 7],
        ["2010-01-14", "B", 3],
    ]

    df = DataFrame(data, columns=["date", "id", "score"])
    df.date = pd.to_datetime(df.date)

    def f(x):
        return x.set_index("date").resample("D").asfreq()

    expected = df.groupby("id").apply(f)

    result = df.set_index("date").groupby("id").resample("D").asfreq()
    tm.assert_frame_equal(result, expected)

    df = DataFrame(
        {
            "date": pd.date_range(start="2016-01-01", periods=4, freq="W"),
            "group": [1, 1, 2, 2],
            "val": [5, 6, 7, 8],
        }
    ).set_index("date")

    def f(x):
        return x.resample("1D").ffill()

    expected = df.groupby("group").apply(f)

    result = df.groupby("group").resample("1D").ffill()
    tm.assert_frame_equal(result, expected)

def regular_predict(data: pd.DataFrame, steps: List[PreprocessingStep],
                    params: List[Dict[str, Any]], model: Model,
                    evaluation: bool = False) -> pd.DataFrame:
    fday = datetime(2016, 4, 25)
    useless_cols = ["id", "date", "demand", "d", "wm_yr_wk"]
    max_lags = 57
    data.date = pd.to_datetime(data.date)
    pred_range = range(28, 56) if evaluation else range(0, 28)

    for tdelta in pred_range:
        reset_dataframe_pivot_cache(data)
        _process_and_predict(data, model, steps, params, fday, tdelta,
                             max_lags, useless_cols)

    sub = _to_submission_format(data)
    return sub

def nyt_county_normalize(df: pd.DataFrame) -> pd.DataFrame:
    df = df.rename(columns={'cases': 'confirmed'})

    # Change their dates to the date format we use
    # They use YYYY-MM-DD
    # We use M/D/YYYY
    def convert_nyt_date(date):
        old = datetime.strptime(date, '%Y-%m-%d')
        new = old.strftime('%-m/%-d/%y')
        return new

    df.date = df.date.map(convert_nyt_date)

    def split_nyt_data(df: pd.DataFrame) -> pd.DataFrame:
        deaths = df.loc[:, df.columns != 'cases']
        confirmed = df.loc[:, df.columns != 'deaths']
        return confirmed, deaths

    # Need to turn date column into a bunch of different columns for each state/county
    dates = list(set(df.date))
    dates.sort(key=lambda d: datetime.strptime(d, '%m/%d/%y'))

    # Split into two dataframes for cases and deaths
    confirmed, deaths = split_nyt_data(df)

    # Do the following for each of confirmed and deaths
    def transpose_nyt_data(df: pd.DataFrame, expand: str) -> pd.DataFrame:
        join_on = ['county', 'state', 'fips']
        state_county = set(map(tuple, df[join_on].values))
        t = pd.DataFrame(state_county, columns=join_on)
        for date in dates:
            s = df[df.date == date][join_on + [expand]]
            s = s.rename(columns={expand: date})
            t = t.merge(s, on=join_on, how='left')
        return t

    confirmed = transpose_nyt_data(confirmed, expand='confirmed')
    deaths = transpose_nyt_data(deaths, expand='deaths')

    # make fips an str instead of a float
    confirmed = confirmed.astype({'fips': 'object'})
    deaths = deaths.astype({'fips': 'object'})
    confirmed.fips = confirmed.fips.astype('Int64').astype(str).str.zfill(5)
    deaths.fips = deaths.fips.astype('Int64').astype(str).str.zfill(5)

    table = pandemics.fetch.county_table()

    def geocode_nyt(df):
        df = pd.merge(df, table, how='left', on='fips')
        cols = df.columns.tolist()
        cols = cols[:3] + cols[-2:] + cols[3:-2]
        df = df[cols]
        date_cols = [col for col in df.columns if '/' in col]
        date_retype = {d: 'Int64' for d in date_cols}
        df = df.astype(date_retype)
        return df

    confirmed = geocode_nyt(confirmed)
    deaths = geocode_nyt(deaths)

    return confirmed, deaths

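# Note: the '%-m/%-d/%y' strftime codes used above are glibc-specific and fail
# on Windows. A platform-independent variant of `convert_nyt_date`, offered only
# as a sketch, builds the string manually:
from datetime import datetime


def convert_nyt_date_portable(date: str) -> str:
    old = datetime.strptime(date, '%Y-%m-%d')
    return f"{old.month}/{old.day}/{old.strftime('%y')}"
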
def df_cleaner(df: pd.DataFrame) -> pd.DataFrame:
    # Convert to date
    df.date = pd.to_datetime(df.date).dt.date
    return df.replace({np.nan: None})

def get_plots(data, plots):
    # type: (List[dict], List[dict]) -> List[dcc.Graph]
    '''
    Gets Dash plots using given dicts.
    Assumes each dict element has all columns of table as keys.

    Args:
        data (list[dict]): List of dicts defining data.
        plots (list[dict]): List of dicts defining plots.

    Raises:
        EnforceError: If data is not a list of dicts.
        EnforceError: If plots is not a list of dicts.

    Returns:
        list[dcc.Graph]: Plots.
    '''
    msg = 'Data must be a list of dictionaries. Given value: {a}.'
    Enforce(data, 'instance of', list, message=msg)
    for item in data:
        Enforce(item, 'instance of', dict, message=msg)

    msg = 'Plots must be a list of dictionaries. Given value: {a}.'
    Enforce(plots, 'instance of', list, message=msg)
    for item in plots:
        Enforce(item, 'instance of', dict, message=msg)
    # --------------------------------------------------------------------------

    data_ = DataFrame(data)
    if 'date' in data_.columns:
        data_.date = DatetimeIndex(data_.date)

    elems = []
    for i, x in enumerate(plots):
        plot = cfg.PlotItem(x)
        plot.validate()
        plot = plot.to_primitive()
        min_width = str(plot['min_width']) + '%'
        try:
            fig = sdt.get_figure(
                data_,
                filters=plot['filters'],
                group=plot['group'],
                pivot=plot['pivot'],
                **plot['figure'],
            )
            fig = dcc.Graph(
                id=f'plot-{i:02d}',
                className='plot',
                figure=fig,
                style={'min-width': min_width},
            )
        except (DataError, EnforceError):
            fig = html.Div(
                id=f'plot-{i:02d}',
                className='plot plot-error',
                style={'min-width': min_width},
                children=html.Div(
                    className='plot-error-container',
                    children=html.Div(
                        className='plot-error-message',
                        children='no data found'
                    )
                )
            )
        elems.append(fig)
    return elems

entry.kd = entry.kills / (entry.deaths if entry.deaths > 0 else 1)
flattened.append(entry)

p = figure(plot_width=1400, plot_height=600, x_axis_type='datetime',
           title="Kills per Match (avg)")
colors = color_gen()

sorted_by_name = sorted(flattened, key=lambda x: x.name)
for key, scores in groupby(sorted_by_name, key=lambda x: x.name):
    scores = [score.toDict() for score in scores]
    if len(scores) > 50:
        df = DataFrame(data=scores)
        df.date = to_datetime(df.date, format='%Y-%m-%d %H:%M:%S %Z')
        df.set_index('date', inplace=True)
        df = df.resample('1d').mean()
        df = df.rolling('30d').mean()
        df = df.interpolate()
        source = ColumnDataSource(df)
        p.line(x='date', y='kills', legend=key, source=source,
               color=next(colors), line_width=4)

show(p)

def prep_food_data(df: pd.DataFrame) -> pd.DataFrame:
    log('Prepping food data frame')
    clean_column_names(df)
    df.date = pd.to_datetime(df.date)
    df.calories_in = handle_commas(df.calories_in)
    return df.sort_values(by='date')

def score_history(self, customer_interactions: DataFrame) -> DataFrame:
    """
    This method contains the logic to calculate the brand-gender score from a list
    of CustomerInteraction:
    - brands purchased and added to the wish list are scored with a specific weight
    - time decay is applied to the score
    - data is grouped by customer_id and the score is summed

    It should receive as input a DataFrame with the following structure:
    ['product_id', 'date', 'brand_id', 'gender', 'views', 'purchased',
     'add_to_cart', 'add_to_wishlist', 'time_on_page']

    :return: DataFrame: ['memberID', 'b_g', 'total_hits']
    """
    if customer_interactions.size < 1:
        raise Exception('Can not score empty user interactions.')

    # Combine brand and gender into one column
    customer_interactions['b_g'] = customer_interactions \
        .apply(lambda x: str(x.brand_id) + ' ' + str(x.gender), axis=1)

    # ==================== Weight users-items interactions ============== #
    # set views value for product purchased (purchased==1)
    customer_interactions.loc[customer_interactions.purchased == 1, 'views'] = \
        self._config['p_weight'] * customer_interactions[customer_interactions.purchased == 1]['views']

    # set views value for product added to wishlist or cart but not purchased (purchased!=1)
    customer_interactions.loc[((customer_interactions.add_to_wishlist == 1) |
                               (customer_interactions.add_to_cart == 1)) &
                              (customer_interactions.purchased != 1), 'views'] = \
        self._config['w_weight'] * customer_interactions[((customer_interactions.add_to_wishlist == 1) |
                                                          (customer_interactions.add_to_cart == 1)) &
                                                         (customer_interactions.purchased != 1)]['views']

    # ==================== Apply time decay function ==================== #
    # Convert string date to datetime
    customer_interactions.date = pd.to_datetime(customer_interactions.date)

    # add a new column decay_date on the data frame with the decay function
    last_browsing_date = pd.to_datetime('now')
    customer_interactions = customer_interactions.assign(
        decay_date=lambda x: (last_browsing_date - x.date).astype(
            'timedelta64[D]').astype('int'))

    # add a new column decay calculated from decay_date and views
    decay_rate = self._get_decay_rate()
    customer_interactions = customer_interactions.assign(
        decay=lambda x: x.views * np.exp(-decay_rate * x.decay_date))

    # ==================== Aggregate and shape dataset ================= #
    # Group the dataset by customer_id, brand, gender; sum the decay and rename it to views
    customer_interactions = customer_interactions.groupby(['customer_id', 'b_g']) \
        .decay.sum() \
        .rename('views') \
        .reset_index()

    # remove unnecessary columns
    customer_interactions = customer_interactions[[
        'customer_id', 'b_g', 'views'
    ]]

    # rename columns to match model expectation
    customer_interactions = customer_interactions.rename(
        columns={'customer_id': 'memberID'})
    customer_interactions = customer_interactions.rename(
        columns={'views': 'total_hits'})

    return customer_interactions

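# A small worked example of the exponential time-decay weighting applied in
# `score_history`: views are multiplied by exp(-decay_rate * age_in_days).
# The decay rate below is an arbitrary illustration, not the configured value.
import numpy as np

decay_rate = 0.05
ages_in_days = np.array([0, 7, 30])
views = np.array([10.0, 10.0, 10.0])

decayed = views * np.exp(-decay_rate * ages_in_days)
print(decayed.round(2))  # -> [10.    7.05  2.23]
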
""" from xml.etree.ElementTree import parse from pandas import DataFrame, Series doc = parse('generated-data\patient-613876.fhir-bundle.xml') root = doc.getroot() """for item in doc.iterfind('feed/'): title = item.findtext('title') print title print doc """ #found = [element for element in doc.iter() if element.text == 'A'] #encounters = doc.findall('{http://hl7.org/fhir}Encounter') #print encounters encounter_dates = [] for encounter in doc.findall('.//{http://hl7.org/fhir}Encounter'): period = encounter.find('{http://hl7.org/fhir}period') start_date = period.find('{http://hl7.org/fhir}start') encounter_dates.append(start_date.get('value')) #print len(encounter_dates) enc_dates = DataFrame(encounter_dates, columns= ['date']) enc_dates.date = enc_dates.date.astype("datetime64") print enc_dates enc_dates.groupby([enc_dates.date.dt.week, enc_dates.date.dt.year]).count().plot(kind="barh")
def change_by_sum(data: DataFrame, metrics, min_sum_allowed=None, min_sum_sub=None) -> DataFrame:
    """

    Parameters
    ----------
    data
    metrics
    min_sum_allowed
    min_sum_sub
        All values in rolling sum that are smaller than ``min_sum_allowed``
        are substituted with ``min_sum_sub``. The latter is expected to be
        smaller than the former to prevent conflicts. At the end of the
        process, all calculated columns carrying ``min_sum_sub``, including
        the metric column, are substituted with ``NaN``.

    Returns
    -------

    """
    metrics = set(metrics).intersection(data.columns)

    data.sort_values(["areaType", "areaCode", "date"],
                     ascending=[True, True, True],
                     inplace=True)

    logging.info(">> Starting to calculate the rolling metrics for")

    date_fmt = "%Y-%m-%d"
    date = "date"
    unique_loc_qualifiers = ["areaType", "areaCode"]
    unique_record_qualifiers = [*unique_loc_qualifiers, date]

    for col_name in metrics:
        rolling_sum_cols = [*unique_record_qualifiers, col_name]

        rolling_sum = f"{col_name}RollingSum"
        change = f"{col_name}Change"
        direction = f"{col_name}Direction"
        change_percentage = f"{col_name}ChangePercentage"

        # Local test
        # col_names.extend([col_name, rolling_sum, change, direction, change_percentage])

        logging.info(f"\t{col_name}")

        d = data.loc[:, rolling_sum_cols]
        d.loc[:, col_name] = d.loc[:, col_name].astype(float)

        if rolling_sum not in data.columns:
            df_rsum = (
                d
                .loc[:, rolling_sum_cols]
                .pipe(col2datetime, col=date, format=date_fmt)
                .groupby(unique_loc_qualifiers)
                .rolling(7, on=date)
                .sum()
                .rename(columns={col_name: rolling_sum})
                .reset_index()
                .loc[:, [*unique_record_qualifiers, rolling_sum]]
                .pipe(datetime2str, col=date, format=date_fmt)
                .set_index(unique_record_qualifiers)
            )

            logging.info("\t\tCalculated rolling sum")

            try:
                data.date = data.date.map(lambda x: x.strftime(date_fmt))
            except AttributeError:
                # Already string
                pass

            if min_sum_allowed is not None:
                df_rsum.loc[df_rsum[rolling_sum] < min_sum_allowed, rolling_sum] = min_sum_sub

            data = (
                data
                .set_index(unique_record_qualifiers)
                .join(df_rsum, on=unique_record_qualifiers)
                .reset_index()
            )

            logging.info("\t\tJoined rolling sum to dataset")

        data.loc[:, rolling_sum] = (
            data
            .groupby(unique_loc_qualifiers)[rolling_sum]
            .apply(replace_all_zero)
        )

        logging.info(f"\t\tGrouped data by {unique_loc_qualifiers}")

        df_tmp = data.loc[:, [*unique_record_qualifiers, rolling_sum]]

        df_tmp = df_tmp.assign(**{
            change: (
                df_tmp
                .pipe(col2datetime, col=date, format=date_fmt)
                .loc[:, [*unique_loc_qualifiers, rolling_sum]]
                .groupby(unique_loc_qualifiers)
                .diff(periods=7)
            ),
            direction: (
                df_tmp
                .pipe(col2datetime, col=date, format=date_fmt)
                .loc[:, [*unique_loc_qualifiers, rolling_sum]]
                .groupby(unique_loc_qualifiers)
                .diff(periods=7)
                .pipe(get_directions, col=rolling_sum)
            )
        })

        logging.info("\t\tCalculated rolling change (diff)")

        percentage_value = (
            df_tmp
            .pipe(col2datetime, col=date, format=date_fmt)
            .loc[:, [*unique_record_qualifiers, rolling_sum]]
            .groupby(unique_loc_qualifiers)
            .rolling(window=8, on=date)[rolling_sum]
            .apply(calculate_percentage_change)
            .round(1)
            .to_frame(change_percentage)
        )

        logging.info("\t\tCalculated percentage change")

        df_tmp = (
            df_tmp
            .join(percentage_value, on=unique_record_qualifiers)
            .pipe(datetime2str, col=date, format=date_fmt)
            .set_index(unique_record_qualifiers)
            .loc[:, [change, direction, change_percentage]]
        )

        logging.info("\t\tJoined percentage to other rolling figures")

        data = (
            data
            .join(df_tmp, on=unique_record_qualifiers)
            .reset_index(drop=True)
        )

        logging.info("\t\tJoined rolling figures to main dataset")

        data.loc[
            data.loc[:, col_name].isnull(),
            [rolling_sum, change, direction, change_percentage]
        ] = NaN

        logging.info("\t\tFinalised the data")

        if min_sum_allowed is not None:
            data.loc[
                data[rolling_sum] == min_sum_sub,
                [rolling_sum, change, direction, change_percentage, col_name]
            ] = NaN

    return data

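# `col2datetime` and `datetime2str` are pipe helpers that are not shown in this
# snippet; minimal sketches of what they plausibly do (assumptions only):
import pandas as pd


def col2datetime(d: pd.DataFrame, col: str, format: str) -> pd.DataFrame:
    """Parse a string column into datetimes and return the frame for piping."""
    d.loc[:, col] = pd.to_datetime(d.loc[:, col], format=format)
    return d


def datetime2str(d: pd.DataFrame, col: str, format: str) -> pd.DataFrame:
    """Render a datetime column back to formatted strings and return the frame."""
    d.loc[:, col] = d.loc[:, col].dt.strftime(format)
    return d
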
# <editor-fold desc="Description"> frmAllBus = frmTrnBus.append(frmTestBus) frmAllChk = frmTrnChk.append(frmTestChk) #get rid of unused del frmTrnChk;del frmTestChk;del frmTrnBus;del frmTestBus # </editor-fold> #------------------- #Data Cleaning #------------------- # <editor-fold desc="Description"> #convert any data types #Review Date - unicode data into datetime frmTrnRev.date = [datetime.strptime(date, '%Y-%m-%d') for date in frmTrnRev.date] frmTestRev.date = [datetime.strptime(date, '%Y-%m-%d') for date in frmTestRev.date] #Flatten any nested columns #business categories #user votes frmTrnUser['votes_cool'] = [rec['cool'] for rec in frmTrnUser.votes] frmTrnUser['votes_funny'] = [rec['funny'] for rec in frmTrnUser.votes] frmTrnUser['votes_useful'] = [rec['useful'] for rec in frmTrnUser.votes] #review votes frmTrnRev['votes_cool'] = [rec['cool'] for rec in frmTrnRev.votes] frmTrnRev['votes_funny'] = [rec['funny'] for rec in frmTrnRev.votes] frmTrnRev['votes_useful'] = [rec['useful'] for rec in frmTrnRev.votes] #Other misc cleaning